aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/regex/regc_lex.c5
-rw-r--r--src/backend/regex/regcomp.c68
-rw-r--r--src/backend/regex/regexec.c36
-rw-r--r--src/backend/utils/adt/jsonpath_gram.y8
-rw-r--r--src/backend/utils/adt/regexp.c12
-rw-r--r--src/include/regex/regex.h2
-rw-r--r--src/include/regex/regguts.h7
-rw-r--r--src/test/modules/test_regex/expected/test_regex.out42
-rw-r--r--src/test/modules/test_regex/sql/test_regex.sql13
9 files changed, 150 insertions, 43 deletions
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
index 7673dab76f4..45727ffa01f 100644
--- a/src/backend/regex/regc_lex.c
+++ b/src/backend/regex/regc_lex.c
@@ -528,10 +528,7 @@ next(struct vars *v)
}
assert(NOTREACHED);
}
- if (v->cflags & REG_NOSUB)
- RETV('(', 0); /* all parens non-capturing */
- else
- RETV('(', 1);
+ RETV('(', 1);
break;
case CHR(')'):
if (LASTTYPE('('))
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 60a220c57ab..ae3a7b6a38c 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -65,7 +65,7 @@ static struct subre *subre(struct vars *, int, int, struct state *, struct state
static void freesubre(struct vars *, struct subre *);
static void freesubreandsiblings(struct vars *, struct subre *);
static void freesrnode(struct vars *, struct subre *);
-static void optst(struct vars *, struct subre *);
+static void removecaptures(struct vars *, struct subre *);
static int numst(struct subre *, int);
static void markst(struct subre *);
static void cleanst(struct vars *);
@@ -431,7 +431,8 @@ pg_regcomp(regex_t *re,
dumpst(v->tree, debug, 1);
}
#endif
- optst(v, v->tree);
+ if (v->cflags & REG_NOSUB)
+ removecaptures(v, v->tree);
v->ntree = numst(v->tree, 1);
markst(v->tree);
cleanst(v);
@@ -1013,14 +1014,16 @@ parseqatom(struct vars *v,
break;
case BACKREF: /* the Feature From The Black Lagoon */
INSIST(type != LACON, REG_ESUBREG);
- INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
- INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
+ subno = v->nextvalue;
+ assert(subno > 0);
+ INSIST(subno < v->nsubs, REG_ESUBREG);
+ NOERRN();
+ INSIST(v->subs[subno] != NULL, REG_ESUBREG);
NOERRN();
- assert(v->nextvalue > 0);
atom = subre(v, 'b', BACKR, lp, rp);
NOERRN();
- subno = v->nextvalue;
atom->backno = subno;
+ v->subs[subno]->flags |= BRUSE;
EMPTYARC(lp, rp); /* temporarily, so there's something */
NEXT();
break;
@@ -2141,19 +2144,54 @@ freesrnode(struct vars *v, /* might be NULL */
}
/*
- * optst - optimize a subRE subtree
+ * removecaptures - remove unnecessary capture subREs
+ *
+ * If the caller said that it doesn't care about subexpression match data,
+ * we may delete the "capture" markers on subREs that are not referenced
+ * by any backrefs, and then simplify anything that's become non-messy.
+ * Call this only if REG_NOSUB flag is set.
*/
static void
-optst(struct vars *v,
- struct subre *t)
+removecaptures(struct vars *v,
+ struct subre *t)
{
+ struct subre *t2;
+
+ assert(t != NULL);
+
+ /*
+ * If this isn't itself a backref target, clear capno and tentatively
+ * clear CAP flag.
+ */
+ if (!(t->flags & BRUSE))
+ {
+ t->capno = 0;
+ t->flags &= ~CAP;
+ }
+
+ /* Now recurse to children */
+ for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
+ {
+ removecaptures(v, t2);
+ /* Propagate child CAP flag back up, if it's still set */
+ if (t2->flags & CAP)
+ t->flags |= CAP;
+ }
+
/*
- * DGP (2007-11-13): I assume it was the programmer's intent to eventually
- * come back and add code to optimize subRE trees, but the routine coded
- * just spends effort traversing the tree and doing nothing. We can do
- * nothing with less effort.
+ * If t now contains neither captures nor backrefs, there's no longer any
+ * need to care where its sub-match boundaries are, so we can reduce it to
+ * a simple DFA node. (Note in particular that MIXED child greediness is
+ * not a hindrance here, so we don't use the MESSY() macro.)
*/
- return;
+ if ((t->flags & (CAP | BACKR)) == 0)
+ {
+ if (t->child)
+ freesubreandsiblings(v, t->child);
+ t->child = NULL;
+ t->op = '=';
+ t->flags &= ~MIXED;
+ }
}
/*
@@ -2501,6 +2539,8 @@ stdump(struct subre *t,
fprintf(f, " hascapture");
if (t->flags & BACKR)
fprintf(f, " hasbackref");
+ if (t->flags & BRUSE)
+ fprintf(f, " isreferenced");
if (!(t->flags & INUSE))
fprintf(f, " UNUSED");
if (t->latype != (char) -1)
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index 5b9a0878203..db54ebfba40 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -213,12 +213,9 @@ pg_regexec(regex_t *re,
return REG_NOMATCH;
backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
v->eflags = flags;
- if (v->g->cflags & REG_NOSUB)
- nmatch = 0; /* override client */
- v->nmatch = nmatch;
- if (backref)
+ if (backref && nmatch <= v->g->nsub)
{
- /* need work area */
+ /* need larger work area */
if (v->g->nsub + 1 <= LOCALMAT)
v->pmatch = mat;
else
@@ -229,7 +226,17 @@ pg_regexec(regex_t *re,
v->nmatch = v->g->nsub + 1;
}
else
+ {
+ /* we can store results directly in caller's array */
v->pmatch = pmatch;
+ /* ensure any extra entries in caller's array are filled with -1 */
+ if (nmatch > 0)
+ zapallsubs(pmatch, nmatch);
+ /* then forget about extra entries, to avoid useless work in find() */
+ if (nmatch > v->g->nsub + 1)
+ nmatch = v->g->nsub + 1;
+ v->nmatch = nmatch;
+ }
v->details = details;
v->start = (chr *) string;
v->search_start = (chr *) string + search_start;
@@ -290,12 +297,20 @@ pg_regexec(regex_t *re,
else
st = find(v, &v->g->tree->cnfa, &v->g->cmap);
- /* copy (portion of) match vector over if necessary */
- if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0)
+ /* on success, ensure caller's match vector is filled correctly */
+ if (st == REG_OKAY && nmatch > 0)
{
- zapallsubs(pmatch, nmatch);
- n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
- memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
+ if (v->pmatch != pmatch)
+ {
+ /* copy portion of match vector over from (larger) work area */
+ assert(nmatch <= v->nmatch);
+ memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
+ }
+ if (v->g->cflags & REG_NOSUB)
+ {
+ /* don't expose possibly-partial sub-match results to caller */
+ zapallsubs(pmatch, nmatch);
+ }
}
/* clean up */
@@ -752,7 +767,6 @@ cdissect(struct vars *v,
break;
case '(': /* no-op capture node */
assert(t->child != NULL);
- assert(t->capno > 0);
er = cdissect(v, t->child, begin, end);
break;
default:
diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y
index de3d97931ef..bd5d4488a06 100644
--- a/src/backend/utils/adt/jsonpath_gram.y
+++ b/src/backend/utils/adt/jsonpath_gram.y
@@ -583,6 +583,14 @@ jspConvertRegexFlags(uint32 xflags)
errmsg("XQuery \"x\" flag (expanded regular expressions) is not implemented")));
}
+ /*
+ * We'll never need sub-match details at execution. While
+ * RE_compile_and_execute would set this flag anyway, force it on here to
+ * ensure that the regex cache entries created by makeItemLikeRegex are
+ * useful.
+ */
+ cflags |= REG_NOSUB;
+
return cflags;
}
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 484d4265fd8..268cee1cbed 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -347,6 +347,10 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
{
regex_t *re;
+ /* Use REG_NOSUB if caller does not want sub-match details */
+ if (nmatch < 2)
+ cflags |= REG_NOSUB;
+
/* Compile RE */
re = RE_compile_and_cache(text_re, cflags, collation);
@@ -1412,6 +1416,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
int orig_len;
pg_wchar *wide_str;
int wide_len;
+ int cflags;
regex_t *cpattern;
regmatch_t *pmatch;
int pmatch_len;
@@ -1430,7 +1435,10 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
/* set up the compiled pattern */
- cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);
+ cflags = re_flags->cflags;
+ if (!use_subpatterns)
+ cflags |= REG_NOSUB;
+ cpattern = RE_compile_and_cache(pattern, cflags, collation);
/* do we want to remember subpatterns? */
if (use_subpatterns && cpattern->re_nsub > 0)
@@ -1952,7 +1960,7 @@ regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
if (case_insensitive)
cflags |= REG_ICASE;
- re = RE_compile_and_cache(text_re, cflags, collation);
+ re = RE_compile_and_cache(text_re, cflags | REG_NOSUB, collation);
/* Examine it to see if there's a fixed prefix */
re_result = pg_regprefix(re, &str, &slen);
diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h
index 2b48a19fb7d..0455ae8069e 100644
--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@@ -106,7 +106,7 @@ typedef struct
#define REG_QUOTE 000004 /* no special characters, none */
#define REG_NOSPEC REG_QUOTE /* historical synonym */
#define REG_ICASE 000010 /* ignore case */
-#define REG_NOSUB 000020 /* don't care about subexpressions */
+#define REG_NOSUB 000020 /* caller doesn't need subexpr match data */
#define REG_EXPANDED 000040 /* expanded format, white space & comments */
#define REG_NLSTOP 000100 /* \n doesn't match . or [^ ] */
#define REG_NLANCH 000200 /* ^ matches after \n, $ before */
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 8db631c83bb..91a52840c47 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -477,13 +477,14 @@ struct subre
#define MIXED 04 /* mixed preference below */
#define CAP 010 /* capturing parens here or below */
#define BACKR 020 /* back reference here or below */
+#define BRUSE 040 /* is referenced by a back reference */
#define INUSE 0100 /* in use in final tree */
-#define NOPROP 03 /* bits which may not propagate up */
+#define UPPROP (MIXED|CAP|BACKR) /* flags which should propagate up */
#define LMIX(f) ((f)<<2) /* LONGER -> MIXED */
#define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */
-#define UP(f) (((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED))
+#define UP(f) (((f)&UPPROP) | (LMIX(f) & SMIX(f) & MIXED))
#define MESSY(f) ((f)&(MIXED|CAP|BACKR))
-#define PREF(f) ((f)&NOPROP)
+#define PREF(f) ((f)&(LONGER|SHORTER))
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
char latype; /* LATYPE code, if lookaround constraint */
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 44da7d20190..5a6cdf47c2f 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -146,12 +146,12 @@ select * from test_regex('(a)e', 'ae', '-');
{ae,a}
(2 rows)
--- expectMatch 4.2 o (a)e ae
-select * from test_regex('(a)e', 'ae', 'o');
- test_regex
-------------
- {0}
- {NULL}
+-- expectMatch 4.2 oPR (.)\1e abeaae aae {}
+select * from test_regex('(.)\1e', 'abeaae', 'oPR');
+ test_regex
+--------------------------------
+ {1,REG_UBACKREF,REG_UNONPOSIX}
+ {aae,NULL}
(2 rows)
-- expectMatch 4.3 b {\(a\)b} ab ab a
@@ -2658,6 +2658,20 @@ select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP');
{"abc abc",abc}
(2 rows)
+-- exercise oversize-regmatch_t-array paths in regexec()
+-- (that case is not reachable via test_regex, sadly)
+select substring('fffoooooooooooooooooooooooooooooooo', '^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
+ substring
+-----------
+ f
+(1 row)
+
+select regexp_split_to_array('abcxxxdefyyyghi', '((.))(\1\2)');
+ regexp_split_to_array
+-----------------------
+ {abc,def,ghi}
+(1 row)
+
-- doing 15 "octal escapes vs back references"
-- # initial zero is always octal
-- expectMatch 15.1 MP "a\\010b" "a\bb" "a\bb"
@@ -3476,6 +3490,22 @@ select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
{x,x,x,NULL}
(2 rows)
+-- expectMatch 21.37 RP ((.))(\2) xyy yy y y y
+select * from test_regex('((.))(\2)', 'xyy', 'RP');
+ test_regex
+--------------------------------
+ {3,REG_UBACKREF,REG_UNONPOSIX}
+ {yy,y,y,y}
+(2 rows)
+
+-- expectMatch 21.38 oRP ((.))(\2) xyy yy {} {} {}
+select * from test_regex('((.))(\2)', 'xyy', 'oRP');
+ test_regex
+--------------------------------
+ {3,REG_UBACKREF,REG_UNONPOSIX}
+ {yy,NULL,NULL,NULL}
+(2 rows)
+
-- doing 22 "multicharacter collating elements"
-- # again ugh
-- MCCEs are not implemented in Postgres, so we skip all these tests
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 9224fdfdd3a..3419564203a 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -63,8 +63,8 @@ select * from test_regex('ab', 'ab', 'b');
-- expectMatch 4.1 - (a)e ae ae a
select * from test_regex('(a)e', 'ae', '-');
--- expectMatch 4.2 o (a)e ae
-select * from test_regex('(a)e', 'ae', 'o');
+-- expectMatch 4.2 oPR (.)\1e abeaae aae {}
+select * from test_regex('(.)\1e', 'abeaae', 'oPR');
-- expectMatch 4.3 b {\(a\)b} ab ab a
select * from test_regex('\(a\)b', 'ab', 'b');
-- expectMatch 4.4 - a((b)c) abc abc bc b
@@ -775,6 +775,11 @@ select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
select * from test_regex('(^\w+\M).*\1', 'abc abcd abd', 'LRP');
select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP');
+-- exercise oversize-regmatch_t-array paths in regexec()
+-- (that case is not reachable via test_regex, sadly)
+select substring('fffoooooooooooooooooooooooooooooooo', '^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
+select regexp_split_to_array('abcxxxdefyyyghi', '((.))(\1\2)');
+
-- doing 15 "octal escapes vs back references"
-- # initial zero is always octal
@@ -1011,6 +1016,10 @@ select * from test_regex('(a*)*', 'bc', 'N');
select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M');
-- expectMatch 21.36 RPQ ((.))(\2){0} xy x x x {}
select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
+-- expectMatch 21.37 RP ((.))(\2) xyy yy y y y
+select * from test_regex('((.))(\2)', 'xyy', 'RP');
+-- expectMatch 21.38 oRP ((.))(\2) xyy yy {} {} {}
+select * from test_regex('((.))(\2)', 'xyy', 'oRP');
-- doing 22 "multicharacter collating elements"
-- # again ugh