diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2021-02-25 13:00:40 -0500 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2021-02-25 13:00:40 -0500 |
commit | 2a0af7fe460eb46f9af996075972bf7c2e3f211d (patch) | |
tree | dc99ebbf913c05e67796401ebbd1cabe4fad349b /src/backend/regex/regc_locale.c | |
parent | 6b40d9bdbdc9f873868b0ddecacd9a307fc8ee26 (diff) | |
download | postgresql-2a0af7fe460eb46f9af996075972bf7c2e3f211d.tar.gz postgresql-2a0af7fe460eb46f9af996075972bf7c2e3f211d.zip |
Allow complemented character class escapes within regex brackets.
The complement-class escapes \D, \S, \W are now allowed within
bracket expressions. There is no semantic difficulty with doing
that, but the rather hokey macro-expansion-based implementation
previously used here couldn't cope.
Also, invent "word" as an allowed character class name, thus "\w"
is now equivalent to "[[:word:]]" outside brackets, or "[:word:]"
within brackets. POSIX allows such implementation-specific
extensions, and the same name is used in e.g. bash.
One surprising compatibility issue this raises is that constructs
such as "[\w-_]" are now disallowed, as our documentation has always
said they should be: character classes can't be endpoints of a range.
Previously, because \w was just a macro for "[:alnum:]_", such a
construct was read as "[[:alnum:]_-_]", so it was accepted so long as
the character after "-" was numerically greater than or equal to "_".
Some implementation cleanup along the way:
* Remove the lexnest() hack, and in consequence clean up wordchrs()
to not interact with the lexer.
* Fix colorcomplement() to not be O(N^2) in the number of colors
involved.
* Get rid of useless-as-far-as-I-can-see calls of element()
on single-character character element names in brackpart().
element() always maps these to the character itself, and things
would be quite broken if it didn't --- should "[a]" match something
different than "a" does? Besides, the shortcut path in brackpart()
wasn't doing this anyway, making it even more inconsistent.
Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex/regc_locale.c')
-rw-r--r-- | src/backend/regex/regc_locale.c | 97 |
1 file changed, 51 insertions, 46 deletions
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c index 047abc3e1e7..b5f3a73b1bb 100644 --- a/src/backend/regex/regc_locale.c +++ b/src/backend/regex/regc_locale.c @@ -350,17 +350,13 @@ static const struct cname }; /* - * The following arrays define the valid character class names. + * The following array defines the valid character class names. + * The entries must match enum char_classes in regguts.h. */ static const char *const classNames[NUM_CCLASSES + 1] = { "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", - "lower", "print", "punct", "space", "upper", "xdigit", NULL -}; - -enum classes -{ - CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, - CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT + "lower", "print", "punct", "space", "upper", "xdigit", "word", + NULL }; /* @@ -536,54 +532,58 @@ eclass(struct vars *v, /* context */ } /* - * cclass - supply cvec for a character class - * - * Must include case counterparts if "cases" is true. + * lookupcclass - lookup a character class identified by name * - * The returned cvec might be either a transient cvec gotten from getcvec(), - * or a permanently cached one from pg_ctype_get_cache(). This is okay - * because callers are not supposed to explicitly free the result either way. + * On failure, sets an error code in *v; the result is then garbage. */ -static struct cvec * -cclass(struct vars *v, /* context */ - const chr *startp, /* where the name starts */ - const chr *endp, /* just past the end of the name */ - int cases) /* case-independent? */ +static enum char_classes +lookupcclass(struct vars *v, /* context (for returning errors) */ + const chr *startp, /* where the name starts */ + const chr *endp) /* just past the end of the name */ { size_t len; - struct cvec *cv = NULL; const char *const *namePtr; - int i, - index; + int i; /* * Map the name to the corresponding enumerated value. 
*/ len = endp - startp; - index = -1; for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) { if (strlen(*namePtr) == len && pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) - { - index = i; - break; - } - } - if (index == -1) - { - ERR(REG_ECTYPE); - return NULL; + return (enum char_classes) i; } + ERR(REG_ECTYPE); + return (enum char_classes) 0; +} + +/* + * cclasscvec - supply cvec for a character class + * + * Must include case counterparts if "cases" is true. + * + * The returned cvec might be either a transient cvec gotten from getcvec(), + * or a permanently cached one from pg_ctype_get_cache(). This is okay + * because callers are not supposed to explicitly free the result either way. + */ +static struct cvec * +cclasscvec(struct vars *v, /* context */ + enum char_classes cclasscode, /* class to build a cvec for */ + int cases) /* case-independent? */ +{ + struct cvec *cv = NULL; + /* * Remap lower and upper to alpha if the match is case insensitive. */ if (cases && - ((enum classes) index == CC_LOWER || - (enum classes) index == CC_UPPER)) - index = (int) CC_ALPHA; + (cclasscode == CC_LOWER || + cclasscode == CC_UPPER)) + cclasscode = CC_ALPHA; /* * Now compute the character class contents. For classes that are based @@ -595,16 +595,19 @@ cclass(struct vars *v, /* context */ * NB: keep this code in sync with cclass_column_index(), below. 
*/ - switch ((enum classes) index) + switch (cclasscode) { case CC_PRINT: - cv = pg_ctype_get_cache(pg_wc_isprint, index); + cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode); break; case CC_ALNUM: - cv = pg_ctype_get_cache(pg_wc_isalnum, index); + cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode); break; case CC_ALPHA: - cv = pg_ctype_get_cache(pg_wc_isalpha, index); + cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode); + break; + case CC_WORD: + cv = pg_ctype_get_cache(pg_wc_isword, cclasscode); break; case CC_ASCII: /* hard-wired meaning */ @@ -625,10 +628,10 @@ cclass(struct vars *v, /* context */ addrange(cv, 0x7f, 0x9f); break; case CC_DIGIT: - cv = pg_ctype_get_cache(pg_wc_isdigit, index); + cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode); break; case CC_PUNCT: - cv = pg_ctype_get_cache(pg_wc_ispunct, index); + cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode); break; case CC_XDIGIT: @@ -646,16 +649,16 @@ cclass(struct vars *v, /* context */ } break; case CC_SPACE: - cv = pg_ctype_get_cache(pg_wc_isspace, index); + cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode); break; case CC_LOWER: - cv = pg_ctype_get_cache(pg_wc_islower, index); + cv = pg_ctype_get_cache(pg_wc_islower, cclasscode); break; case CC_UPPER: - cv = pg_ctype_get_cache(pg_wc_isupper, index); + cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode); break; case CC_GRAPH: - cv = pg_ctype_get_cache(pg_wc_isgraph, index); + cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode); break; } @@ -678,7 +681,7 @@ cclass_column_index(struct colormap *cm, chr c) /* * Note: we should not see requests to consider cclasses that are not - * treated as locale-specific by cclass(), above. + * treated as locale-specific by cclasscvec(), above. 
*/ if (cm->classbits[CC_PRINT] && pg_wc_isprint(c)) colnum |= cm->classbits[CC_PRINT]; @@ -686,6 +689,8 @@ cclass_column_index(struct colormap *cm, chr c) colnum |= cm->classbits[CC_ALNUM]; if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c)) colnum |= cm->classbits[CC_ALPHA]; + if (cm->classbits[CC_WORD] && pg_wc_isword(c)) + colnum |= cm->classbits[CC_WORD]; assert(cm->classbits[CC_ASCII] == 0); assert(cm->classbits[CC_BLANK] == 0); assert(cm->classbits[CC_CNTRL] == 0); |