diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2021-02-25 13:00:40 -0500 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2021-02-25 13:00:40 -0500 |
commit | 2a0af7fe460eb46f9af996075972bf7c2e3f211d (patch) | |
tree | dc99ebbf913c05e67796401ebbd1cabe4fad349b /src/backend/regex/regc_lex.c | |
parent | 6b40d9bdbdc9f873868b0ddecacd9a307fc8ee26 (diff) | |
download | postgresql-2a0af7fe460eb46f9af996075972bf7c2e3f211d.tar.gz postgresql-2a0af7fe460eb46f9af996075972bf7c2e3f211d.zip |
Allow complemented character class escapes within regex brackets.
The complement-class escapes \D, \S, \W are now allowed within
bracket expressions. There is no semantic difficulty with doing
that, but the rather hokey macro-expansion-based implementation
previously used here couldn't cope.
Also, invent "word" as an allowed character class name, thus "\w"
is now equivalent to "[[:word:]]" outside brackets, or "[:word:]"
within brackets. POSIX allows such implementation-specific
extensions, and the same name is used in e.g. bash.
One surprising compatibility issue this raises is that constructs
such as "[\w-_]" are now disallowed, as our documentation has always
said they should be: character classes can't be endpoints of a range.
Previously, because \w was just a macro for "[:alnum:]_", such a
construct was read as "[[:alnum:]_-_]", so it was accepted so long as
the character after "-" was numerically greater than or equal to "_".
Some implementation cleanup along the way:
* Remove the lexnest() hack, and in consequence clean up wordchrs()
  to not interact with the lexer.
* Fix colorcomplement() to not be O(N^2) in the number of colors
  involved.
* Get rid of useless-as-far-as-I-can-see calls of element()
  on single-character character element names in brackpart().
  element() always maps these to the character itself, and things
  would be quite broken if it didn't --- should "[a]" match something
  different than "a" does? Besides, the shortcut path in brackpart()
  wasn't doing this anyway, making it even more inconsistent.
Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex/regc_lex.c')
-rw-r--r-- | src/backend/regex/regc_lex.c | 166 |
1 files changed, 16 insertions, 150 deletions
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c index 16664531641..7673dab76f4 100644 --- a/src/backend/regex/regc_lex.c +++ b/src/backend/regex/regc_lex.c @@ -194,83 +194,6 @@ prefixes(struct vars *v) } /* - * lexnest - "call a subroutine", interpolating string at the lexical level - * - * Note, this is not a very general facility. There are a number of - * implicit assumptions about what sorts of strings can be subroutines. - */ -static void -lexnest(struct vars *v, - const chr *beginp, /* start of interpolation */ - const chr *endp) /* one past end of interpolation */ -{ - assert(v->savenow == NULL); /* only one level of nesting */ - v->savenow = v->now; - v->savestop = v->stop; - v->now = beginp; - v->stop = endp; -} - -/* - * string constants to interpolate as expansions of things like \d - */ -static const chr backd[] = { /* \d */ - CHR('['), CHR('['), CHR(':'), - CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr backD[] = { /* \D */ - CHR('['), CHR('^'), CHR('['), CHR(':'), - CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr brbackd[] = { /* \d within brackets */ - CHR('['), CHR(':'), - CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), - CHR(':'), CHR(']') -}; -static const chr backs[] = { /* \s */ - CHR('['), CHR('['), CHR(':'), - CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr backS[] = { /* \S */ - CHR('['), CHR('^'), CHR('['), CHR(':'), - CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr brbacks[] = { /* \s within brackets */ - CHR('['), CHR(':'), - CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), - CHR(':'), CHR(']') -}; -static const chr backw[] = { /* \w */ - CHR('['), CHR('['), CHR(':'), - CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_'), CHR(']') -}; -static const chr backW[] = { /* 
\W */ - CHR('['), CHR('^'), CHR('['), CHR(':'), - CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_'), CHR(']') -}; -static const chr brbackw[] = { /* \w within brackets */ - CHR('['), CHR(':'), - CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_') -}; - -/* - * lexword - interpolate a bracket expression for word characters - * Possibly ought to inquire whether there is a "word" character class. - */ -static void -lexword(struct vars *v) -{ - lexnest(v, backw, ENDOF(backw)); -} - -/* * next - get next token */ static int /* 1 normal, 0 failure */ @@ -292,14 +215,6 @@ next(struct vars *v) RETV(SBEGIN, 0); /* same as \A */ } - /* if we're nested and we've hit end, return to outer level */ - if (v->savenow != NULL && ATEOS()) - { - v->now = v->savenow; - v->stop = v->savestop; - v->savenow = v->savestop = NULL; - } - /* skip white space etc. if appropriate (not in literal or []) */ if (v->cflags & REG_EXPANDED) switch (v->lexcon) @@ -420,32 +335,15 @@ next(struct vars *v) NOTE(REG_UNONPOSIX); if (ATEOS()) FAILW(REG_EESCAPE); - (DISCARD) lexescape(v); + if (!lexescape(v)) + return 0; switch (v->nexttype) { /* not all escapes okay here */ case PLAIN: + case CCLASSS: + case CCLASSC: return 1; break; - case CCLASS: - switch (v->nextvalue) - { - case 'd': - lexnest(v, brbackd, ENDOF(brbackd)); - break; - case 's': - lexnest(v, brbacks, ENDOF(brbacks)); - break; - case 'w': - lexnest(v, brbackw, ENDOF(brbackw)); - break; - default: - FAILW(REG_EESCAPE); - break; - } - /* lexnest done, back up and try again */ - v->nexttype = v->lasttype; - return next(v); - break; } /* not one of the acceptable escapes */ FAILW(REG_EESCAPE); @@ -691,49 +589,17 @@ next(struct vars *v) } RETV(PLAIN, *v->now++); } - (DISCARD) lexescape(v); - if (ISERR()) - FAILW(REG_EESCAPE); - if (v->nexttype == CCLASS) - { /* fudge at lexical level */ - switch (v->nextvalue) - { - case 'd': - lexnest(v, backd, ENDOF(backd)); - break; - case 'D': - 
lexnest(v, backD, ENDOF(backD)); - break; - case 's': - lexnest(v, backs, ENDOF(backs)); - break; - case 'S': - lexnest(v, backS, ENDOF(backS)); - break; - case 'w': - lexnest(v, backw, ENDOF(backw)); - break; - case 'W': - lexnest(v, backW, ENDOF(backW)); - break; - default: - assert(NOTREACHED); - FAILW(REG_ASSERT); - break; - } - /* lexnest done, back up and try again */ - v->nexttype = v->lasttype; - return next(v); - } - /* otherwise, lexescape has already done the work */ - return !ISERR(); + return lexescape(v); } /* * lexescape - parse an ARE backslash escape (backslash already eaten) - * Note slightly nonstandard use of the CCLASS type code. + * + * This is used for ARE backslashes both normally and inside bracket + * expressions. In the latter case, not all escape types are allowed, + * but the caller must reject unwanted ones after we return. */ -static int /* not actually used, but convenient for RETV */ +static int lexescape(struct vars *v) { chr c; @@ -775,11 +641,11 @@ lexescape(struct vars *v) break; case CHR('d'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'd'); + RETV(CCLASSS, CC_DIGIT); break; case CHR('D'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'D'); + RETV(CCLASSC, CC_DIGIT); break; case CHR('e'): NOTE(REG_UUNPORT); @@ -802,11 +668,11 @@ lexescape(struct vars *v) break; case CHR('s'): NOTE(REG_ULOCALE); - RETV(CCLASS, 's'); + RETV(CCLASSS, CC_SPACE); break; case CHR('S'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'S'); + RETV(CCLASSC, CC_SPACE); break; case CHR('t'): RETV(PLAIN, CHR('\t')); @@ -828,11 +694,11 @@ lexescape(struct vars *v) break; case CHR('w'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'w'); + RETV(CCLASSS, CC_WORD); break; case CHR('W'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'W'); + RETV(CCLASSC, CC_WORD); break; case CHR('x'): NOTE(REG_UUNPORT); |