aboutsummaryrefslogtreecommitdiff
path: root/src/backend/regex/regc_locale.c
diff options
context:
space:
mode:
author: Tom Lane <tgl@sss.pgh.pa.us> 2021-02-25 13:00:40 -0500
committer: Tom Lane <tgl@sss.pgh.pa.us> 2021-02-25 13:00:40 -0500
commit: 2a0af7fe460eb46f9af996075972bf7c2e3f211d (patch)
tree: dc99ebbf913c05e67796401ebbd1cabe4fad349b /src/backend/regex/regc_locale.c
parent: 6b40d9bdbdc9f873868b0ddecacd9a307fc8ee26 (diff)
download: postgresql-2a0af7fe460eb46f9af996075972bf7c2e3f211d.tar.gz
download: postgresql-2a0af7fe460eb46f9af996075972bf7c2e3f211d.zip
Allow complemented character class escapes within regex brackets.

The complement-class escapes \D, \S, \W are now allowed within bracket expressions. There is no semantic difficulty with doing that, but the rather hokey macro-expansion-based implementation previously used here couldn't cope.

Also, invent "word" as an allowed character class name, thus "\w" is now equivalent to "[[:word:]]" outside brackets, or "[:word:]" within brackets. POSIX allows such implementation-specific extensions, and the same name is used in e.g. bash.

One surprising compatibility issue this raises is that constructs such as "[\w-_]" are now disallowed, as our documentation has always said they should be: character classes can't be endpoints of a range. Previously, because \w was just a macro for "[:alnum:]_", such a construct was read as "[[:alnum:]_-_]", so it was accepted so long as the character after "-" was numerically greater than or equal to "_".

Some implementation cleanup along the way:

* Remove the lexnest() hack, and in consequence clean up wordchrs() to not interact with the lexer.

* Fix colorcomplement() to not be O(N^2) in the number of colors involved.

* Get rid of useless-as-far-as-I-can-see calls of element() on single-character character element names in brackpart(). element() always maps these to the character itself, and things would be quite broken if it didn't --- should "[a]" match something different than "a" does? Besides, the shortcut path in brackpart() wasn't doing this anyway, making it even more inconsistent.

Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us

Diffstat (limited to 'src/backend/regex/regc_locale.c')
-rw-r--r-- src/backend/regex/regc_locale.c | 97
1 files changed, 51 insertions, 46 deletions
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
index 047abc3e1e7..b5f3a73b1bb 100644
--- a/src/backend/regex/regc_locale.c
+++ b/src/backend/regex/regc_locale.c
@@ -350,17 +350,13 @@ static const struct cname
};
/*
- * The following arrays define the valid character class names.
+ * The following array defines the valid character class names.
+ * The entries must match enum char_classes in regguts.h.
*/
static const char *const classNames[NUM_CCLASSES + 1] = {
"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
- "lower", "print", "punct", "space", "upper", "xdigit", NULL
-};
-
-enum classes
-{
- CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
- CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
+ "lower", "print", "punct", "space", "upper", "xdigit", "word",
+ NULL
};
/*
@@ -536,54 +532,58 @@ eclass(struct vars *v, /* context */
}
/*
- * cclass - supply cvec for a character class
- *
- * Must include case counterparts if "cases" is true.
+ * lookupcclass - lookup a character class identified by name
*
- * The returned cvec might be either a transient cvec gotten from getcvec(),
- * or a permanently cached one from pg_ctype_get_cache(). This is okay
- * because callers are not supposed to explicitly free the result either way.
+ * On failure, sets an error code in *v; the result is then garbage.
*/
-static struct cvec *
-cclass(struct vars *v, /* context */
- const chr *startp, /* where the name starts */
- const chr *endp, /* just past the end of the name */
- int cases) /* case-independent? */
+static enum char_classes
+lookupcclass(struct vars *v, /* context (for returning errors) */
+ const chr *startp, /* where the name starts */
+ const chr *endp) /* just past the end of the name */
{
size_t len;
- struct cvec *cv = NULL;
const char *const *namePtr;
- int i,
- index;
+ int i;
/*
* Map the name to the corresponding enumerated value.
*/
len = endp - startp;
- index = -1;
for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
{
if (strlen(*namePtr) == len &&
pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
- {
- index = i;
- break;
- }
- }
- if (index == -1)
- {
- ERR(REG_ECTYPE);
- return NULL;
+ return (enum char_classes) i;
}
+ ERR(REG_ECTYPE);
+ return (enum char_classes) 0;
+}
+
+/*
+ * cclasscvec - supply cvec for a character class
+ *
+ * Must include case counterparts if "cases" is true.
+ *
+ * The returned cvec might be either a transient cvec gotten from getcvec(),
+ * or a permanently cached one from pg_ctype_get_cache(). This is okay
+ * because callers are not supposed to explicitly free the result either way.
+ */
+static struct cvec *
+cclasscvec(struct vars *v, /* context */
+ enum char_classes cclasscode, /* class to build a cvec for */
+ int cases) /* case-independent? */
+{
+ struct cvec *cv = NULL;
+
/*
* Remap lower and upper to alpha if the match is case insensitive.
*/
if (cases &&
- ((enum classes) index == CC_LOWER ||
- (enum classes) index == CC_UPPER))
- index = (int) CC_ALPHA;
+ (cclasscode == CC_LOWER ||
+ cclasscode == CC_UPPER))
+ cclasscode = CC_ALPHA;
/*
* Now compute the character class contents. For classes that are based
@@ -595,16 +595,19 @@ cclass(struct vars *v, /* context */
* NB: keep this code in sync with cclass_column_index(), below.
*/
- switch ((enum classes) index)
+ switch (cclasscode)
{
case CC_PRINT:
- cv = pg_ctype_get_cache(pg_wc_isprint, index);
+ cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
break;
case CC_ALNUM:
- cv = pg_ctype_get_cache(pg_wc_isalnum, index);
+ cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
break;
case CC_ALPHA:
- cv = pg_ctype_get_cache(pg_wc_isalpha, index);
+ cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
+ break;
+ case CC_WORD:
+ cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
break;
case CC_ASCII:
/* hard-wired meaning */
@@ -625,10 +628,10 @@ cclass(struct vars *v, /* context */
addrange(cv, 0x7f, 0x9f);
break;
case CC_DIGIT:
- cv = pg_ctype_get_cache(pg_wc_isdigit, index);
+ cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
break;
case CC_PUNCT:
- cv = pg_ctype_get_cache(pg_wc_ispunct, index);
+ cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
break;
case CC_XDIGIT:
@@ -646,16 +649,16 @@ cclass(struct vars *v, /* context */
}
break;
case CC_SPACE:
- cv = pg_ctype_get_cache(pg_wc_isspace, index);
+ cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
break;
case CC_LOWER:
- cv = pg_ctype_get_cache(pg_wc_islower, index);
+ cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
break;
case CC_UPPER:
- cv = pg_ctype_get_cache(pg_wc_isupper, index);
+ cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
break;
case CC_GRAPH:
- cv = pg_ctype_get_cache(pg_wc_isgraph, index);
+ cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
break;
}
@@ -678,7 +681,7 @@ cclass_column_index(struct colormap *cm, chr c)
/*
* Note: we should not see requests to consider cclasses that are not
- * treated as locale-specific by cclass(), above.
+ * treated as locale-specific by cclasscvec(), above.
*/
if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
colnum |= cm->classbits[CC_PRINT];
@@ -686,6 +689,8 @@ cclass_column_index(struct colormap *cm, chr c)
colnum |= cm->classbits[CC_ALNUM];
if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
colnum |= cm->classbits[CC_ALPHA];
+ if (cm->classbits[CC_WORD] && pg_wc_isword(c))
+ colnum |= cm->classbits[CC_WORD];
assert(cm->classbits[CC_ASCII] == 0);
assert(cm->classbits[CC_BLANK] == 0);
assert(cm->classbits[CC_CNTRL] == 0);