about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- doc/src/sgml/func.sgml 25
-rw-r--r-- src/backend/regex/re_syntax.n 13
-rw-r--r-- src/backend/regex/regc_color.c 34
-rw-r--r-- src/backend/regex/regc_lex.c 166
-rw-r--r-- src/backend/regex/regc_locale.c 97
-rw-r--r-- src/backend/regex/regc_pg_locale.c 9
-rw-r--r-- src/backend/regex/regcomp.c 285
-rw-r--r-- src/include/regex/regguts.h 20
-rw-r--r-- src/test/modules/test_regex/expected/test_regex.out 250
-rw-r--r-- src/test/modules/test_regex/sql/test_regex.sql 44
10 files changed, 672 insertions, 271 deletions
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index d8224272a57..860ae118264 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -6097,6 +6097,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
non-ASCII characters to belong to any of these classes.)
In addition to these standard character
classes, <productname>PostgreSQL</productname> defines
+ the <literal>word</literal> character class, which is the same as
+ <literal>alnum</literal> plus the underscore (<literal>_</literal>)
+ character, and
the <literal>ascii</literal> character class, which contains exactly
the 7-bit ASCII set.
</para>
@@ -6108,9 +6111,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
matching empty strings at the beginning
and end of a word respectively. A word is defined as a sequence
of word characters that is neither preceded nor followed by word
- characters. A word character is an <literal>alnum</literal> character (as
- defined by the <acronym>POSIX</acronym> character class described above)
- or an underscore. This is an extension, compatible with but not
+ characters. A word character is any character belonging to the
+ <literal>word</literal> character class, that is, any letter, digit,
+ or underscore. This is an extension, compatible with but not
specified by <acronym>POSIX</acronym> 1003.2, and should be used with
caution in software intended to be portable to other systems.
The constraint escapes described below are usually preferable; they
@@ -6330,8 +6333,7 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
<row>
<entry> <literal>\w</literal> </entry>
- <entry> <literal>[[:alnum:]_]</literal>
- (note underscore is included) </entry>
+ <entry> <literal>[[:word:]]</literal> </entry>
</row>
<row>
@@ -6346,21 +6348,18 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
<row>
<entry> <literal>\W</literal> </entry>
- <entry> <literal>[^[:alnum:]_]</literal>
- (note underscore is included) </entry>
+ <entry> <literal>[^[:word:]]</literal> </entry>
</row>
</tbody>
</tgroup>
</table>
<para>
- Within bracket expressions, <literal>\d</literal>, <literal>\s</literal>,
- and <literal>\w</literal> lose their outer brackets,
- and <literal>\D</literal>, <literal>\S</literal>, and <literal>\W</literal> are illegal.
- (So, for example, <literal>[a-c\d]</literal> is equivalent to
+ The class-shorthand escapes also work within bracket expressions,
+ although the definitions shown above are not quite syntactically
+ valid in that context.
+ For example, <literal>[a-c\d]</literal> is equivalent to
<literal>[a-c[:digit:]]</literal>.
- Also, <literal>[a-c\D]</literal>, which is equivalent to
- <literal>[a-c^[:digit:]]</literal>, is illegal.)
</para>
<table id="posix-constraint-escapes-table">
diff --git a/src/backend/regex/re_syntax.n b/src/backend/regex/re_syntax.n
index 4621bfc25f4..1afaa7cce7c 100644
--- a/src/backend/regex/re_syntax.n
+++ b/src/backend/regex/re_syntax.n
@@ -519,15 +519,10 @@ character classes:
(note underscore)
.RE
.PP
-Within bracket expressions, `\fB\ed\fR', `\fB\es\fR',
-and `\fB\ew\fR'\&
-lose their outer brackets,
-and `\fB\eD\fR', `\fB\eS\fR',
-and `\fB\eW\fR'\&
-are illegal.
-.VS 8.2
-(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
-Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.)
+The class-shorthand escapes also work within bracket expressions,
+although the definitions shown above are not quite syntactically
+valid in that context.
+For example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
.VE 8.2
.PP
A constraint escape (AREs only) is a constraint,
diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c
index 0864011cce1..30bda0e5ad0 100644
--- a/src/backend/regex/regc_color.c
+++ b/src/backend/regex/regc_color.c
@@ -936,7 +936,16 @@ okcolors(struct nfa *nfa,
}
else if (cd->nschrs == 0 && cd->nuchrs == 0)
{
- /* parent empty, its arcs change color to subcolor */
+ /*
+ * Parent is now empty, so just change all its arcs to the
+ * subcolor, then free the parent.
+ *
+ * It is not obvious that simply relabeling the arcs like this is
+ * OK; it appears to risk creating duplicate arcs. We are
+ * basically relying on the assumption that processing of a
+ * bracket expression can't create arcs of both a color and its
+ * subcolor between the bracket's endpoints.
+ */
cd->sub = NOSUB;
scd = &cm->cd[sco];
assert(scd->nschrs > 0 || scd->nuchrs > 0);
@@ -1062,6 +1071,7 @@ colorcomplement(struct nfa *nfa,
struct colordesc *cd;
struct colordesc *end = CDEND(cm);
color co;
+ struct arc *a;
assert(of != from);
@@ -1069,10 +1079,26 @@ colorcomplement(struct nfa *nfa,
if (findarc(of, PLAIN, RAINBOW) != NULL)
return;
+ /* Otherwise, transiently mark the colors that appear in of's out-arcs */
+ for (a = of->outs; a != NULL; a = a->outchain)
+ {
+ if (a->type == PLAIN)
+ {
+ assert(a->co >= 0);
+ cd = &cm->cd[a->co];
+ assert(!UNUSEDCOLOR(cd));
+ cd->flags |= COLMARK;
+ }
+ }
+
+ /* Scan colors, clear transient marks, add arcs for unmarked colors */
for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
- if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
- if (findarc(of, PLAIN, co) == NULL)
- newarc(nfa, type, co, from, to);
+ {
+ if (cd->flags & COLMARK)
+ cd->flags &= ~COLMARK;
+ else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
+ newarc(nfa, type, co, from, to);
+ }
}
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
index 16664531641..7673dab76f4 100644
--- a/src/backend/regex/regc_lex.c
+++ b/src/backend/regex/regc_lex.c
@@ -194,83 +194,6 @@ prefixes(struct vars *v)
}
/*
- * lexnest - "call a subroutine", interpolating string at the lexical level
- *
- * Note, this is not a very general facility. There are a number of
- * implicit assumptions about what sorts of strings can be subroutines.
- */
-static void
-lexnest(struct vars *v,
- const chr *beginp, /* start of interpolation */
- const chr *endp) /* one past end of interpolation */
-{
- assert(v->savenow == NULL); /* only one level of nesting */
- v->savenow = v->now;
- v->savestop = v->stop;
- v->now = beginp;
- v->stop = endp;
-}
-
-/*
- * string constants to interpolate as expansions of things like \d
- */
-static const chr backd[] = { /* \d */
- CHR('['), CHR('['), CHR(':'),
- CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
- CHR(':'), CHR(']'), CHR(']')
-};
-static const chr backD[] = { /* \D */
- CHR('['), CHR('^'), CHR('['), CHR(':'),
- CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
- CHR(':'), CHR(']'), CHR(']')
-};
-static const chr brbackd[] = { /* \d within brackets */
- CHR('['), CHR(':'),
- CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
- CHR(':'), CHR(']')
-};
-static const chr backs[] = { /* \s */
- CHR('['), CHR('['), CHR(':'),
- CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
- CHR(':'), CHR(']'), CHR(']')
-};
-static const chr backS[] = { /* \S */
- CHR('['), CHR('^'), CHR('['), CHR(':'),
- CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
- CHR(':'), CHR(']'), CHR(']')
-};
-static const chr brbacks[] = { /* \s within brackets */
- CHR('['), CHR(':'),
- CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
- CHR(':'), CHR(']')
-};
-static const chr backw[] = { /* \w */
- CHR('['), CHR('['), CHR(':'),
- CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
- CHR(':'), CHR(']'), CHR('_'), CHR(']')
-};
-static const chr backW[] = { /* \W */
- CHR('['), CHR('^'), CHR('['), CHR(':'),
- CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
- CHR(':'), CHR(']'), CHR('_'), CHR(']')
-};
-static const chr brbackw[] = { /* \w within brackets */
- CHR('['), CHR(':'),
- CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
- CHR(':'), CHR(']'), CHR('_')
-};
-
-/*
- * lexword - interpolate a bracket expression for word characters
- * Possibly ought to inquire whether there is a "word" character class.
- */
-static void
-lexword(struct vars *v)
-{
- lexnest(v, backw, ENDOF(backw));
-}
-
-/*
* next - get next token
*/
static int /* 1 normal, 0 failure */
@@ -292,14 +215,6 @@ next(struct vars *v)
RETV(SBEGIN, 0); /* same as \A */
}
- /* if we're nested and we've hit end, return to outer level */
- if (v->savenow != NULL && ATEOS())
- {
- v->now = v->savenow;
- v->stop = v->savestop;
- v->savenow = v->savestop = NULL;
- }
-
/* skip white space etc. if appropriate (not in literal or []) */
if (v->cflags & REG_EXPANDED)
switch (v->lexcon)
@@ -420,32 +335,15 @@ next(struct vars *v)
NOTE(REG_UNONPOSIX);
if (ATEOS())
FAILW(REG_EESCAPE);
- (DISCARD) lexescape(v);
+ if (!lexescape(v))
+ return 0;
switch (v->nexttype)
{ /* not all escapes okay here */
case PLAIN:
+ case CCLASSS:
+ case CCLASSC:
return 1;
break;
- case CCLASS:
- switch (v->nextvalue)
- {
- case 'd':
- lexnest(v, brbackd, ENDOF(brbackd));
- break;
- case 's':
- lexnest(v, brbacks, ENDOF(brbacks));
- break;
- case 'w':
- lexnest(v, brbackw, ENDOF(brbackw));
- break;
- default:
- FAILW(REG_EESCAPE);
- break;
- }
- /* lexnest done, back up and try again */
- v->nexttype = v->lasttype;
- return next(v);
- break;
}
/* not one of the acceptable escapes */
FAILW(REG_EESCAPE);
@@ -691,49 +589,17 @@ next(struct vars *v)
}
RETV(PLAIN, *v->now++);
}
- (DISCARD) lexescape(v);
- if (ISERR())
- FAILW(REG_EESCAPE);
- if (v->nexttype == CCLASS)
- { /* fudge at lexical level */
- switch (v->nextvalue)
- {
- case 'd':
- lexnest(v, backd, ENDOF(backd));
- break;
- case 'D':
- lexnest(v, backD, ENDOF(backD));
- break;
- case 's':
- lexnest(v, backs, ENDOF(backs));
- break;
- case 'S':
- lexnest(v, backS, ENDOF(backS));
- break;
- case 'w':
- lexnest(v, backw, ENDOF(backw));
- break;
- case 'W':
- lexnest(v, backW, ENDOF(backW));
- break;
- default:
- assert(NOTREACHED);
- FAILW(REG_ASSERT);
- break;
- }
- /* lexnest done, back up and try again */
- v->nexttype = v->lasttype;
- return next(v);
- }
- /* otherwise, lexescape has already done the work */
- return !ISERR();
+ return lexescape(v);
}
/*
* lexescape - parse an ARE backslash escape (backslash already eaten)
- * Note slightly nonstandard use of the CCLASS type code.
+ *
+ * This is used for ARE backslashes both normally and inside bracket
+ * expressions. In the latter case, not all escape types are allowed,
+ * but the caller must reject unwanted ones after we return.
*/
-static int /* not actually used, but convenient for RETV */
+static int
lexescape(struct vars *v)
{
chr c;
@@ -775,11 +641,11 @@ lexescape(struct vars *v)
break;
case CHR('d'):
NOTE(REG_ULOCALE);
- RETV(CCLASS, 'd');
+ RETV(CCLASSS, CC_DIGIT);
break;
case CHR('D'):
NOTE(REG_ULOCALE);
- RETV(CCLASS, 'D');
+ RETV(CCLASSC, CC_DIGIT);
break;
case CHR('e'):
NOTE(REG_UUNPORT);
@@ -802,11 +668,11 @@ lexescape(struct vars *v)
break;
case CHR('s'):
NOTE(REG_ULOCALE);
- RETV(CCLASS, 's');
+ RETV(CCLASSS, CC_SPACE);
break;
case CHR('S'):
NOTE(REG_ULOCALE);
- RETV(CCLASS, 'S');
+ RETV(CCLASSC, CC_SPACE);
break;
case CHR('t'):
RETV(PLAIN, CHR('\t'));
@@ -828,11 +694,11 @@ lexescape(struct vars *v)
break;
case CHR('w'):
NOTE(REG_ULOCALE);
- RETV(CCLASS, 'w');
+ RETV(CCLASSS, CC_WORD);
break;
case CHR('W'):
NOTE(REG_ULOCALE);
- RETV(CCLASS, 'W');
+ RETV(CCLASSC, CC_WORD);
break;
case CHR('x'):
NOTE(REG_UUNPORT);
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
index 047abc3e1e7..b5f3a73b1bb 100644
--- a/src/backend/regex/regc_locale.c
+++ b/src/backend/regex/regc_locale.c
@@ -350,17 +350,13 @@ static const struct cname
};
/*
- * The following arrays define the valid character class names.
+ * The following array defines the valid character class names.
+ * The entries must match enum char_classes in regguts.h.
*/
static const char *const classNames[NUM_CCLASSES + 1] = {
"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
- "lower", "print", "punct", "space", "upper", "xdigit", NULL
-};
-
-enum classes
-{
- CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
- CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
+ "lower", "print", "punct", "space", "upper", "xdigit", "word",
+ NULL
};
/*
@@ -536,54 +532,58 @@ eclass(struct vars *v, /* context */
}
/*
- * cclass - supply cvec for a character class
- *
- * Must include case counterparts if "cases" is true.
+ * lookupcclass - lookup a character class identified by name
*
- * The returned cvec might be either a transient cvec gotten from getcvec(),
- * or a permanently cached one from pg_ctype_get_cache(). This is okay
- * because callers are not supposed to explicitly free the result either way.
+ * On failure, sets an error code in *v; the result is then garbage.
*/
-static struct cvec *
-cclass(struct vars *v, /* context */
- const chr *startp, /* where the name starts */
- const chr *endp, /* just past the end of the name */
- int cases) /* case-independent? */
+static enum char_classes
+lookupcclass(struct vars *v, /* context (for returning errors) */
+ const chr *startp, /* where the name starts */
+ const chr *endp) /* just past the end of the name */
{
size_t len;
- struct cvec *cv = NULL;
const char *const *namePtr;
- int i,
- index;
+ int i;
/*
* Map the name to the corresponding enumerated value.
*/
len = endp - startp;
- index = -1;
for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
{
if (strlen(*namePtr) == len &&
pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
- {
- index = i;
- break;
- }
- }
- if (index == -1)
- {
- ERR(REG_ECTYPE);
- return NULL;
+ return (enum char_classes) i;
}
+ ERR(REG_ECTYPE);
+ return (enum char_classes) 0;
+}
+
+/*
+ * cclasscvec - supply cvec for a character class
+ *
+ * Must include case counterparts if "cases" is true.
+ *
+ * The returned cvec might be either a transient cvec gotten from getcvec(),
+ * or a permanently cached one from pg_ctype_get_cache(). This is okay
+ * because callers are not supposed to explicitly free the result either way.
+ */
+static struct cvec *
+cclasscvec(struct vars *v, /* context */
+ enum char_classes cclasscode, /* class to build a cvec for */
+ int cases) /* case-independent? */
+{
+ struct cvec *cv = NULL;
+
/*
* Remap lower and upper to alpha if the match is case insensitive.
*/
if (cases &&
- ((enum classes) index == CC_LOWER ||
- (enum classes) index == CC_UPPER))
- index = (int) CC_ALPHA;
+ (cclasscode == CC_LOWER ||
+ cclasscode == CC_UPPER))
+ cclasscode = CC_ALPHA;
/*
* Now compute the character class contents. For classes that are based
@@ -595,16 +595,19 @@ cclass(struct vars *v, /* context */
* NB: keep this code in sync with cclass_column_index(), below.
*/
- switch ((enum classes) index)
+ switch (cclasscode)
{
case CC_PRINT:
- cv = pg_ctype_get_cache(pg_wc_isprint, index);
+ cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
break;
case CC_ALNUM:
- cv = pg_ctype_get_cache(pg_wc_isalnum, index);
+ cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
break;
case CC_ALPHA:
- cv = pg_ctype_get_cache(pg_wc_isalpha, index);
+ cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
+ break;
+ case CC_WORD:
+ cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
break;
case CC_ASCII:
/* hard-wired meaning */
@@ -625,10 +628,10 @@ cclass(struct vars *v, /* context */
addrange(cv, 0x7f, 0x9f);
break;
case CC_DIGIT:
- cv = pg_ctype_get_cache(pg_wc_isdigit, index);
+ cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
break;
case CC_PUNCT:
- cv = pg_ctype_get_cache(pg_wc_ispunct, index);
+ cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
break;
case CC_XDIGIT:
@@ -646,16 +649,16 @@ cclass(struct vars *v, /* context */
}
break;
case CC_SPACE:
- cv = pg_ctype_get_cache(pg_wc_isspace, index);
+ cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
break;
case CC_LOWER:
- cv = pg_ctype_get_cache(pg_wc_islower, index);
+ cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
break;
case CC_UPPER:
- cv = pg_ctype_get_cache(pg_wc_isupper, index);
+ cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
break;
case CC_GRAPH:
- cv = pg_ctype_get_cache(pg_wc_isgraph, index);
+ cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
break;
}
@@ -678,7 +681,7 @@ cclass_column_index(struct colormap *cm, chr c)
/*
* Note: we should not see requests to consider cclasses that are not
- * treated as locale-specific by cclass(), above.
+ * treated as locale-specific by cclasscvec(), above.
*/
if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
colnum |= cm->classbits[CC_PRINT];
@@ -686,6 +689,8 @@ cclass_column_index(struct colormap *cm, chr c)
colnum |= cm->classbits[CC_ALNUM];
if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
colnum |= cm->classbits[CC_ALPHA];
+ if (cm->classbits[CC_WORD] && pg_wc_isword(c))
+ colnum |= cm->classbits[CC_WORD];
assert(cm->classbits[CC_ASCII] == 0);
assert(cm->classbits[CC_BLANK] == 0);
assert(cm->classbits[CC_CNTRL] == 0);
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index 1fff3df1dae..bbbd61c604a 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -401,6 +401,15 @@ pg_wc_isalnum(pg_wchar c)
}
static int
+pg_wc_isword(pg_wchar c)
+{
+ /* We define word characters as alnum class plus underscore */
+ if (c == CHR('_'))
+ return 1;
+ return pg_wc_isalnum(c);
+}
+
+static int
pg_wc_isupper(pg_wchar c)
{
switch (pg_regex_strategy)
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 0cd4b4c4c29..7b77a29136c 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -46,13 +46,18 @@ static struct subre *parsebranch(struct vars *, int, int, struct state *, struct
static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
static void nonword(struct vars *, int, struct state *, struct state *);
static void word(struct vars *, int, struct state *, struct state *);
+static void charclass(struct vars *, enum char_classes,
+ struct state *, struct state *);
+static void charclasscomplement(struct vars *, enum char_classes,
+ struct state *, struct state *);
static int scannum(struct vars *);
static void repeat(struct vars *, struct state *, struct state *, int, int);
static void bracket(struct vars *, struct state *, struct state *);
static void cbracket(struct vars *, struct state *, struct state *);
-static void brackpart(struct vars *, struct state *, struct state *);
+static void brackpart(struct vars *, struct state *, struct state *, bool *);
static const chr *scanplain(struct vars *);
static void onechr(struct vars *, chr, struct state *, struct state *);
+static void optimizebracket(struct vars *, struct state *, struct state *);
static void wordchrs(struct vars *);
static void processlacon(struct vars *, struct state *, struct state *, int,
struct state *, struct state *);
@@ -81,8 +86,6 @@ static const char *stid(struct subre *, char *, size_t);
/* === regc_lex.c === */
static void lexstart(struct vars *);
static void prefixes(struct vars *);
-static void lexnest(struct vars *, const chr *, const chr *);
-static void lexword(struct vars *);
static int next(struct vars *);
static int lexescape(struct vars *);
static chr lexdigits(struct vars *, int, int, int);
@@ -206,6 +209,7 @@ static void freecvec(struct cvec *);
static int pg_wc_isdigit(pg_wchar c);
static int pg_wc_isalpha(pg_wchar c);
static int pg_wc_isalnum(pg_wchar c);
+static int pg_wc_isword(pg_wchar c);
static int pg_wc_isupper(pg_wchar c);
static int pg_wc_islower(pg_wchar c);
static int pg_wc_isgraph(pg_wchar c);
@@ -220,7 +224,8 @@ static chr element(struct vars *, const chr *, const chr *);
static struct cvec *range(struct vars *, chr, chr, int);
static int before(chr, chr);
static struct cvec *eclass(struct vars *, chr, int);
-static struct cvec *cclass(struct vars *, const chr *, const chr *, int);
+static enum char_classes lookupcclass(struct vars *, const chr *, const chr *);
+static struct cvec *cclasscvec(struct vars *, enum char_classes, int);
static int cclass_column_index(struct colormap *, chr);
static struct cvec *allcases(struct vars *, chr);
static int cmp(const chr *, const chr *, size_t);
@@ -233,14 +238,12 @@ struct vars
regex_t *re;
const chr *now; /* scan pointer into string */
const chr *stop; /* end of string */
- const chr *savenow; /* saved now and stop for "subroutine call" */
- const chr *savestop;
int err; /* error code (0 if none) */
int cflags; /* copy of compile flags */
int lasttype; /* type of previous token */
int nexttype; /* type of next token */
chr nextvalue; /* value (if any) of next token */
- int lexcon; /* lexical context type (see lex.c) */
+ int lexcon; /* lexical context type (see regc_lex.c) */
int nsubexp; /* subexpression count */
struct subre **subs; /* subRE pointer vector */
size_t nsubs; /* length of vector */
@@ -287,6 +290,8 @@ struct vars
#define ECLASS 'E' /* start of [= */
#define CCLASS 'C' /* start of [: */
#define END 'X' /* end of [. [= [: */
+#define CCLASSS 's' /* char class shorthand escape */
+#define CCLASSC 'c' /* complement char class shorthand escape */
#define RANGE 'R' /* - within [] which might be range delim. */
#define LACON 'L' /* lookaround constraint subRE */
#define AHEAD 'a' /* color-lookahead arc */
@@ -356,7 +361,6 @@ pg_regcomp(regex_t *re,
v->re = re;
v->now = string;
v->stop = v->now + len;
- v->savenow = v->savestop = NULL;
v->err = 0;
v->cflags = flags;
v->nsubexp = 0;
@@ -835,23 +839,25 @@ parseqatom(struct vars *v,
return;
break;
case '<':
- wordchrs(v); /* does NEXT() */
+ wordchrs(v);
s = newstate(v->nfa);
NOERR();
nonword(v, BEHIND, lp, s);
word(v, AHEAD, s, rp);
+ NEXT();
return;
break;
case '>':
- wordchrs(v); /* does NEXT() */
+ wordchrs(v);
s = newstate(v->nfa);
NOERR();
word(v, BEHIND, lp, s);
nonword(v, AHEAD, s, rp);
+ NEXT();
return;
break;
case WBDRY:
- wordchrs(v); /* does NEXT() */
+ wordchrs(v);
s = newstate(v->nfa);
NOERR();
nonword(v, BEHIND, lp, s);
@@ -860,10 +866,11 @@ parseqatom(struct vars *v,
NOERR();
word(v, BEHIND, lp, s);
nonword(v, AHEAD, s, rp);
+ NEXT();
return;
break;
case NWBDRY:
- wordchrs(v); /* does NEXT() */
+ wordchrs(v);
s = newstate(v->nfa);
NOERR();
word(v, BEHIND, lp, s);
@@ -872,6 +879,7 @@ parseqatom(struct vars *v,
NOERR();
nonword(v, BEHIND, lp, s);
nonword(v, AHEAD, s, rp);
+ NEXT();
return;
break;
case LACON: /* lookaround constraint */
@@ -925,6 +933,16 @@ parseqatom(struct vars *v,
assert(SEE(']') || ISERR());
NEXT();
break;
+ case CCLASSS:
+ charclass(v, (enum char_classes) v->nextvalue, lp, rp);
+ okcolors(v->nfa, v->cm);
+ NEXT();
+ break;
+ case CCLASSC:
+ charclasscomplement(v, (enum char_classes) v->nextvalue, lp, rp);
+ /* charclasscomplement() did okcolors() internally */
+ NEXT();
+ break;
case '.':
rainbow(v->nfa, v->cm, PLAIN,
(v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
@@ -1339,6 +1357,75 @@ word(struct vars *v,
}
/*
+ * charclass - generate arcs for a character class
+ *
+ * This is used for both atoms (\w and sibling escapes) and for elements
+ * of bracket expressions. The caller is responsible for calling okcolors()
+ * at the end of processing the atom or bracket.
+ */
+static void
+charclass(struct vars *v,
+ enum char_classes cls,
+ struct state *lp,
+ struct state *rp)
+{
+ struct cvec *cv;
+
+ /* obtain possibly-cached cvec for char class */
+ NOTE(REG_ULOCALE);
+ cv = cclasscvec(v, cls, (v->cflags & REG_ICASE));
+ NOERR();
+
+ /* build the arcs; this may cause color splitting */
+ subcolorcvec(v, cv, lp, rp);
+}
+
+/*
+ * charclasscomplement - generate arcs for a complemented character class
+ *
+ * This is used for both atoms (\W and sibling escapes) and for elements
+ * of bracket expressions. In bracket expressions, it is the caller's
+ * responsibility that there not be any open subcolors when this is called.
+ */
+static void
+charclasscomplement(struct vars *v,
+ enum char_classes cls,
+ struct state *lp,
+ struct state *rp)
+{
+ struct state *cstate;
+ struct cvec *cv;
+
+ /* make dummy state to hang temporary arcs on */
+ cstate = newstate(v->nfa);
+ NOERR();
+
+ /* obtain possibly-cached cvec for char class */
+ NOTE(REG_ULOCALE);
+ cv = cclasscvec(v, cls, (v->cflags & REG_ICASE));
+ NOERR();
+
+ /* build arcs for char class; this may cause color splitting */
+ subcolorcvec(v, cv, cstate, cstate);
+
+ /* in NLSTOP mode, ensure newline is not part of the result set */
+ if (v->cflags & REG_NLSTOP)
+ newarc(v->nfa, PLAIN, v->nlcolor, cstate, cstate);
+ NOERR();
+
+ /* clean up any subcolors in the arc set */
+ okcolors(v->nfa, v->cm);
+ NOERR();
+
+ /* now build output arcs for the complement of the char class */
+ colorcomplement(v->nfa, v->cm, PLAIN, cstate, lp, rp);
+ NOERR();
+
+ /* clean up dummy state */
+ dropstate(v->nfa, cstate);
+}
+
+/*
* scannum - scan a number
*/
static int /* value, <= DUPMAX */
@@ -1456,6 +1543,7 @@ repeat(struct vars *v,
/*
* bracket - handle non-complemented bracket expression
+ *
* Also called from cbracket for complemented bracket expressions.
*/
static void
@@ -1463,16 +1551,52 @@ bracket(struct vars *v,
struct state *lp,
struct state *rp)
{
+ /*
+ * We can't process complemented char classes (e.g. \W) immediately while
+ * scanning the bracket expression, else color bookkeeping gets confused.
+ * Instead, remember whether we saw any in have_cclassc[], and process
+ * them at the end.
+ */
+ bool have_cclassc[NUM_CCLASSES];
+ bool any_cclassc;
+ int i;
+
+ memset(have_cclassc, false, sizeof(have_cclassc));
+
assert(SEE('['));
NEXT();
while (!SEE(']') && !SEE(EOS))
- brackpart(v, lp, rp);
+ brackpart(v, lp, rp, have_cclassc);
assert(SEE(']') || ISERR());
+
+ /* close up open subcolors from the positive bracket elements */
okcolors(v->nfa, v->cm);
+ NOERR();
+
+ /* now handle any complemented elements */
+ any_cclassc = false;
+ for (i = 0; i < NUM_CCLASSES; i++)
+ {
+ if (have_cclassc[i])
+ {
+ charclasscomplement(v, (enum char_classes) i, lp, rp);
+ NOERR();
+ any_cclassc = true;
+ }
+ }
+
+ /*
+ * If we had any complemented elements, see if we can optimize the bracket
+ * into a rainbow. Since a complemented element is the only way a WHITE
+ * arc could get into the result, there's no point in checking otherwise.
+ */
+ if (any_cclassc)
+ optimizebracket(v, lp, rp);
}
/*
* cbracket - handle complemented bracket expression
+ *
* We do it by calling bracket() with dummy endpoints, and then complementing
* the result. The alternative would be to invoke rainbow(), and then delete
* arcs as the b.e. is seen... but that gets messy, and is really quite
@@ -1496,7 +1620,9 @@ cbracket(struct vars *v,
/*
* Easy part of complementing, and all there is to do since the MCCE code
- * was removed.
+ * was removed. Note that the result of colorcomplement() cannot be a
+ * rainbow, since we don't allow empty brackets; so there's no point in
+ * calling optimizebracket() again.
*/
colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
NOERR();
@@ -1511,14 +1637,15 @@ cbracket(struct vars *v,
static void
brackpart(struct vars *v,
struct state *lp,
- struct state *rp)
+ struct state *rp,
+ bool *have_cclassc)
{
chr startc;
chr endc;
struct cvec *cv;
+ enum char_classes cls;
const chr *startp;
const chr *endp;
- chr c[1];
/* parse something, get rid of special cases, take shortcuts */
switch (v->nexttype)
@@ -1528,15 +1655,14 @@ brackpart(struct vars *v,
return;
break;
case PLAIN:
- c[0] = v->nextvalue;
+ startc = v->nextvalue;
NEXT();
/* shortcut for ordinary chr (not range) */
if (!SEE(RANGE))
{
- onechr(v, c[0], lp, rp);
+ onechr(v, startc, lp, rp);
return;
}
- startc = element(v, c, c + 1);
NOERR();
break;
case COLLEL:
@@ -1564,9 +1690,20 @@ brackpart(struct vars *v,
endp = scanplain(v);
INSIST(startp < endp, REG_ECTYPE);
NOERR();
- cv = cclass(v, startp, endp, (v->cflags & REG_ICASE));
+ cls = lookupcclass(v, startp, endp);
NOERR();
- subcolorcvec(v, cv, lp, rp);
+ charclass(v, cls, lp, rp);
+ return;
+ break;
+ case CCLASSS:
+ charclass(v, (enum char_classes) v->nextvalue, lp, rp);
+ NEXT();
+ return;
+ break;
+ case CCLASSC:
+ /* we cannot call charclasscomplement() immediately */
+ have_cclassc[v->nextvalue] = true;
+ NEXT();
return;
break;
default:
@@ -1582,9 +1719,8 @@ brackpart(struct vars *v,
{
case PLAIN:
case RANGE:
- c[0] = v->nextvalue;
+ endc = v->nextvalue;
NEXT();
- endc = element(v, c, c + 1);
NOERR();
break;
case COLLEL:
@@ -1618,7 +1754,7 @@ brackpart(struct vars *v,
/*
* scanplain - scan PLAIN contents of [. etc.
*
- * Certain bits of trickery in lex.c know that this code does not try
+ * Certain bits of trickery in regc_lex.c know that this code does not try
* to look past the final bracket of the [. etc.
*/
static const chr * /* just after end of sequence */
@@ -1665,38 +1801,97 @@ onechr(struct vars *v,
}
/*
+ * optimizebracket - see if bracket expression can be converted to RAINBOW
+ *
+ * Cases such as "[\s\S]" can produce a set of arcs of all colors, which we
+ * can replace by a single RAINBOW arc for efficiency. (This might seem
+ * like a silly way to write ".", but it's seemingly a common locution in
+ * some other flavors of regex, so take the trouble to support it well.)
+ */
+static void
+optimizebracket(struct vars *v,
+ struct state *lp,
+ struct state *rp)
+{
+ struct colordesc *cd;
+ struct colordesc *end = CDEND(v->cm);
+ struct arc *a;
+ bool israinbow;
+
+ /*
+ * Scan lp's out-arcs and transiently mark the mentioned colors. We
+ * expect that all of lp's out-arcs are plain, non-RAINBOW arcs to rp.
+ * (Note: there shouldn't be any pseudocolors yet, but check anyway.)
+ */
+ for (a = lp->outs; a != NULL; a = a->outchain)
+ {
+ assert(a->type == PLAIN);
+ assert(a->co >= 0); /* i.e. not RAINBOW */
+ assert(a->to == rp);
+ cd = &v->cm->cd[a->co];
+ assert(!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO));
+ cd->flags |= COLMARK;
+ }
+
+ /* Scan colors, clear transient marks, check for unmarked live colors */
+ israinbow = true;
+ for (cd = v->cm->cd; cd < end; cd++)
+ {
+ if (cd->flags & COLMARK)
+ cd->flags &= ~COLMARK;
+ else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
+ israinbow = false;
+ }
+
+ /* Can't do anything if not all colors have arcs */
+ if (!israinbow)
+ return;
+
+ /* OK, drop existing arcs and replace with a rainbow */
+ while ((a = lp->outs) != NULL)
+ freearc(v->nfa, a);
+ newarc(v->nfa, PLAIN, RAINBOW, lp, rp);
+}
+
+/*
* wordchrs - set up word-chr list for word-boundary stuff, if needed
*
- * The list is kept as a bunch of arcs between two dummy states; it's
- * disposed of by the unreachable-states sweep in NFA optimization.
- * Does NEXT(). Must not be called from any unusual lexical context.
- * This should be reconciled with the \w etc. handling in lex.c, and
- * should be cleaned up to reduce dependencies on input scanning.
+ * The list is kept as a bunch of circular arcs on an otherwise-unused state.
+ *
+ * Note that this must not be called while we have any open subcolors,
+ * else construction of the list would confuse color bookkeeping.
+ * Hence, we can't currently apply a similar optimization in
+ * charclass[complement](), as those need to be usable within bracket
+ * expressions.
*/
static void
wordchrs(struct vars *v)
{
- struct state *left;
- struct state *right;
+ struct state *cstate; /* dummy state the word-character arcs are attached to */
+ struct cvec *cv; /* cvec returned by cclasscvec() for CC_WORD */
if (v->wordchrs != NULL)
- {
- NEXT(); /* for consistency */
- return;
- }
+ return; /* done already */
- left = newstate(v->nfa);
- right = newstate(v->nfa);
+ /* make dummy state to hang the cache arcs on */
+ cstate = newstate(v->nfa);
NOERR();
- /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
- lexword(v);
- NEXT();
- assert(v->savenow != NULL && SEE('['));
- bracket(v, left, right);
- assert((v->savenow != NULL && SEE(']')) || ISERR());
- NEXT();
+
+ /* obtain possibly-cached cvec for \w characters */
+ NOTE(REG_ULOCALE);
+ cv = cclasscvec(v, CC_WORD, (v->cflags & REG_ICASE));
NOERR();
- v->wordchrs = left;
+
+ /* build the arcs; this may cause color splitting */
+ subcolorcvec(v, cv, cstate, cstate); /* cstate is passed for both state arguments, matching the "circular arcs" in the header comment */
+ NOERR();
+
+ /* close new open subcolors to ensure the cache entry is self-contained */
+ okcolors(v->nfa, v->cm);
+ NOERR();
+
+ /* success! save the cache pointer */
+ v->wordchrs = cstate;
}
/*
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 306525eb5fa..0e76a828f8f 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -128,6 +128,18 @@
/*
+ * known character classes
+ */
+enum char_classes
+{
+ CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
+ CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT, CC_WORD
+};
+
+#define NUM_CCLASSES 14 /* equals the number of enumerators in enum char_classes above; presumably must be kept in sync when a class is added */
+
+
+/*
* As soon as possible, we map chrs into equivalence classes -- "colors" --
* which are of much more manageable number.
*
@@ -164,12 +176,14 @@ struct colordesc
#define NOSUB COLORLESS /* value of "sub" when no open subcolor */
struct arc *arcs; /* chain of all arcs of this color */
chr firstchr; /* simple char first assigned to this color */
- int flags; /* bit values defined next */
+ int flags; /* bitmask of the following flags: */
#define FREECOL 01 /* currently free */
#define PSEUDO 02 /* pseudocolor, no real chars */
-#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
+#define COLMARK 04 /* temporary marker used in some functions */
};
+#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
+
/*
* The color map itself
*
@@ -199,8 +213,6 @@ struct colordesc
* appear in increasing chr-value order.
*/
-#define NUM_CCLASSES 13 /* must match data in regc_locale.c */
-
typedef struct colormaprange
{
chr cmin; /* range represents cmin..cmax inclusive */
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 21282789c27..92154b6d28a 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -1970,6 +1970,256 @@ select * from test_regex('a[\w]b', 'axb', 'LPE');
{axb}
(2 rows)
+-- these should be invalid
+select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE');
+ERROR: invalid regular expression: invalid character range
+select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE');
+ERROR: invalid regular expression: invalid character range
+select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS');
+ERROR: invalid regular expression: invalid character range
+select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS');
+ERROR: invalid regular expression: invalid character range
+-- test complemented char classes within brackets
+select * from test_regex('[\D]', '0123456789abc*', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {a}
+(2 rows)
+
+select * from test_regex('[^\D]', 'abc0123456789*', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {0}
+(2 rows)
+
+select * from test_regex('[1\D7]', '0123456789abc*', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {1}
+(2 rows)
+
+select * from test_regex('[7\D1]', '0123456789abc*', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {1}
+(2 rows)
+
+select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {2}
+(2 rows)
+
+select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {2}
+(2 rows)
+
+select * from test_regex('\W', '0123456789abc_*', 'LP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {*}
+(2 rows)
+
+select * from test_regex('[\W]', '0123456789abc_*', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {*}
+(2 rows)
+
+select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LNPE');
+ test_regex
+--------------------------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE,REG_UEMPTYMATCH}
+ {"012 3456789abc_*"}
+(2 rows)
+
+-- check char classes' handling of newlines
+select * from test_regex('\s+', E'abc \n def', 'LP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {" +
+ "}
+(2 rows)
+
+select * from test_regex('\s+', E'abc \n def', 'nLP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {" +
+ "}
+(2 rows)
+
+select * from test_regex('[\s]+', E'abc \n def', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {" +
+ "}
+(2 rows)
+
+select * from test_regex('[\s]+', E'abc \n def', 'nLPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {" +
+ "}
+(2 rows)
+
+select * from test_regex('\S+', E'abc\ndef', 'LP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {abc}
+(2 rows)
+
+select * from test_regex('\S+', E'abc\ndef', 'nLP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {abc}
+(2 rows)
+
+select * from test_regex('[\S]+', E'abc\ndef', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {abc}
+(2 rows)
+
+select * from test_regex('[\S]+', E'abc\ndef', 'nLPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {abc}
+(2 rows)
+
+select * from test_regex('\d+', E'012\n345', 'LP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {012}
+(2 rows)
+
+select * from test_regex('\d+', E'012\n345', 'nLP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {012}
+(2 rows)
+
+select * from test_regex('[\d]+', E'012\n345', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {012}
+(2 rows)
+
+select * from test_regex('[\d]+', E'012\n345', 'nLPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {012}
+(2 rows)
+
+select * from test_regex('\D+', E'abc\ndef345', 'LP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {"abc +
+ def"}
+(2 rows)
+
+select * from test_regex('\D+', E'abc\ndef345', 'nLP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {abc}
+(2 rows)
+
+select * from test_regex('[\D]+', E'abc\ndef345', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {"abc +
+ def"}
+(2 rows)
+
+select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {abc}
+(2 rows)
+
+select * from test_regex('\w+', E'abc_012\ndef', 'LP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {abc_012}
+(2 rows)
+
+select * from test_regex('\w+', E'abc_012\ndef', 'nLP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {abc_012}
+(2 rows)
+
+select * from test_regex('[\w]+', E'abc_012\ndef', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {abc_012}
+(2 rows)
+
+select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {abc_012}
+(2 rows)
+
+select * from test_regex('\W+', E'***\n@@@___', 'LP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {"*** +
+ @@@"}
+(2 rows)
+
+select * from test_regex('\W+', E'***\n@@@___', 'nLP');
+ test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {***}
+(2 rows)
+
+select * from test_regex('[\W]+', E'***\n@@@___', 'LPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {"*** +
+ @@@"}
+(2 rows)
+
+select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE');
+ test_regex
+----------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
+ {***}
+(2 rows)
+
-- doing 13 "escapes"
-- expectError 13.1 & "a\\" EESCAPE
select * from test_regex('a\', '', '');
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 31e947ee9c6..b99329391e8 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -597,6 +597,50 @@ select * from test_regex('a[\s]b', 'a b', 'LPE');
-- expectMatch 12.18 LPE {a[\w]b} axb axb
select * from test_regex('a[\w]b', 'axb', 'LPE');
+-- these should be invalid
+select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE');
+select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE');
+select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS');
+select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS');
+
+-- test complemented char classes within brackets
+select * from test_regex('[\D]', '0123456789abc*', 'LPE');
+select * from test_regex('[^\D]', 'abc0123456789*', 'LPE');
+select * from test_regex('[1\D7]', '0123456789abc*', 'LPE');
+select * from test_regex('[7\D1]', '0123456789abc*', 'LPE');
+select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE');
+select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE');
+select * from test_regex('\W', '0123456789abc_*', 'LP');
+select * from test_regex('[\W]', '0123456789abc_*', 'LPE');
+select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LNPE');
+
+-- check char classes' handling of newlines
+select * from test_regex('\s+', E'abc \n def', 'LP');
+select * from test_regex('\s+', E'abc \n def', 'nLP');
+select * from test_regex('[\s]+', E'abc \n def', 'LPE');
+select * from test_regex('[\s]+', E'abc \n def', 'nLPE');
+select * from test_regex('\S+', E'abc\ndef', 'LP');
+select * from test_regex('\S+', E'abc\ndef', 'nLP');
+select * from test_regex('[\S]+', E'abc\ndef', 'LPE');
+select * from test_regex('[\S]+', E'abc\ndef', 'nLPE');
+select * from test_regex('\d+', E'012\n345', 'LP');
+select * from test_regex('\d+', E'012\n345', 'nLP');
+select * from test_regex('[\d]+', E'012\n345', 'LPE');
+select * from test_regex('[\d]+', E'012\n345', 'nLPE');
+select * from test_regex('\D+', E'abc\ndef345', 'LP');
+select * from test_regex('\D+', E'abc\ndef345', 'nLP');
+select * from test_regex('[\D]+', E'abc\ndef345', 'LPE');
+select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE');
+select * from test_regex('\w+', E'abc_012\ndef', 'LP');
+select * from test_regex('\w+', E'abc_012\ndef', 'nLP');
+select * from test_regex('[\w]+', E'abc_012\ndef', 'LPE');
+select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE');
+select * from test_regex('\W+', E'***\n@@@___', 'LP');
+select * from test_regex('\W+', E'***\n@@@___', 'nLP');
+select * from test_regex('[\W]+', E'***\n@@@___', 'LPE');
+select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE');
+
+
-- doing 13 "escapes"
-- expectError 13.1 & "a\\" EESCAPE