aboutsummaryrefslogtreecommitdiff
path: root/src/backend/regex/regc_color.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2021-02-25 13:00:40 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2021-02-25 13:00:40 -0500
commit2a0af7fe460eb46f9af996075972bf7c2e3f211d (patch)
treedc99ebbf913c05e67796401ebbd1cabe4fad349b /src/backend/regex/regc_color.c
parent6b40d9bdbdc9f873868b0ddecacd9a307fc8ee26 (diff)
downloadpostgresql-2a0af7fe460eb46f9af996075972bf7c2e3f211d.tar.gz
postgresql-2a0af7fe460eb46f9af996075972bf7c2e3f211d.zip
Allow complemented character class escapes within regex brackets.
The complement-class escapes \D, \S, \W are now allowed within bracket expressions. There is no semantic difficulty with doing that, but the rather hokey macro-expansion-based implementation previously used here couldn't cope. Also, invent "word" as an allowed character class name, thus "\w" is now equivalent to "[[:word:]]" outside brackets, or "[:word:]" within brackets. POSIX allows such implementation-specific extensions, and the same name is used in e.g. bash. One surprising compatibility issue this raises is that constructs such as "[\w-_]" are now disallowed, as our documentation has always said they should be: character classes can't be endpoints of a range. Previously, because \w was just a macro for "[:alnum:]_", such a construct was read as "[[:alnum:]_-_]", so it was accepted so long as the character after "-" was numerically greater than or equal to "_". Some implementation cleanup along the way: * Remove the lexnest() hack, and in consequence clean up wordchrs() to not interact with the lexer. * Fix colorcomplement() to not be O(N^2) in the number of colors involved. * Get rid of useless-as-far-as-I-can-see calls of element() on single-character character element names in brackpart(). element() always maps these to the character itself, and things would be quite broken if it didn't --- should "[a]" match something different than "a" does? Besides, the shortcut path in brackpart() wasn't doing this anyway, making it even more inconsistent. Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex/regc_color.c')
-rw-r--r--src/backend/regex/regc_color.c34
1 files changed, 30 insertions, 4 deletions
diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c
index 0864011cce1..30bda0e5ad0 100644
--- a/src/backend/regex/regc_color.c
+++ b/src/backend/regex/regc_color.c
@@ -936,7 +936,16 @@ okcolors(struct nfa *nfa,
}
else if (cd->nschrs == 0 && cd->nuchrs == 0)
{
- /* parent empty, its arcs change color to subcolor */
+ /*
+ * Parent is now empty, so just change all its arcs to the
+ * subcolor, then free the parent.
+ *
+ * It is not obvious that simply relabeling the arcs like this is
+ * OK; it appears to risk creating duplicate arcs. We are
+ * basically relying on the assumption that processing of a
+ * bracket expression can't create arcs of both a color and its
+ * subcolor between the bracket's endpoints.
+ */
cd->sub = NOSUB;
scd = &cm->cd[sco];
assert(scd->nschrs > 0 || scd->nuchrs > 0);
@@ -1062,6 +1071,7 @@ colorcomplement(struct nfa *nfa,
struct colordesc *cd;
struct colordesc *end = CDEND(cm);
color co;
+ struct arc *a;
assert(of != from);
@@ -1069,10 +1079,26 @@ colorcomplement(struct nfa *nfa,
if (findarc(of, PLAIN, RAINBOW) != NULL)
return;
+ /* Otherwise, transiently mark the colors that appear in of's out-arcs */
+ for (a = of->outs; a != NULL; a = a->outchain)
+ {
+ if (a->type == PLAIN)
+ {
+ assert(a->co >= 0);
+ cd = &cm->cd[a->co];
+ assert(!UNUSEDCOLOR(cd));
+ cd->flags |= COLMARK;
+ }
+ }
+
+ /* Scan colors, clear transient marks, add arcs for unmarked colors */
for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
- if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
- if (findarc(of, PLAIN, co) == NULL)
- newarc(nfa, type, co, from, to);
+ {
+ if (cd->flags & COLMARK)
+ cd->flags &= ~COLMARK;
+ else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
+ newarc(nfa, type, co, from, to);
+ }
}