diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/utils/adt/regexp.c | 73 | ||||
-rw-r--r-- | src/test/regress/expected/strings.out | 59 | ||||
-rw-r--r-- | src/test/regress/sql/strings.sql | 26 |
3 files changed, 140 insertions, 18 deletions
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index da13a875eb0..ab44846e0b3 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -708,20 +708,42 @@ similar_escape(PG_FUNCTION_ARGS) * We surround the transformed input string with * ^(?: ... )$ * which requires some explanation. We need "^" and "$" to force - * the pattern to match the entire input string as per SQL99 spec. + * the pattern to match the entire input string as per the SQL spec. * The "(?:" and ")" are a non-capturing set of parens; we have to have * parens in case the string contains "|", else the "^" and "$" will * be bound into the first and last alternatives which is not what we * want, and the parens must be non capturing because we don't want them * to count when selecting output for SUBSTRING. + * + * When the pattern is divided into three parts by escape-double-quotes, + * what we emit is + * ^(?:part1){1,1}?(part2){1,1}(?:part3)$ + * which requires even more explanation. The "{1,1}?" on part1 makes it + * non-greedy so that it will match the smallest possible amount of text + * not the largest, as required by SQL. The plain parens around part2 + * are capturing parens so that that part is what controls the result of + * SUBSTRING. The "{1,1}" forces part2 to be greedy, so that it matches + * the largest possible amount of text; hence part3 must match the + * smallest amount of text, as required by SQL. We don't need an explicit + * greediness marker on part3. Note that this also confines the effects + * of any "|" characters to the respective part, which is what we want. + * + * The SQL spec says that SUBSTRING's pattern must contain exactly two + * escape-double-quotes, but we only complain if there's more than two. + * With none, we act as though part1 and part3 are empty; with one, we + * act as though part3 is empty. Both behaviors fall out of omitting + * the relevant part separators in the above expansion. If the result + * of this function is used in a plain regexp match (SIMILAR TO), the + * escape-double-quotes have no effect on the match behavior. *---------- */ /* - * We need room for the prefix/postfix plus as many as 3 output bytes per - * input byte; since the input is at most 1GB this can't overflow + * We need room for the prefix/postfix and part separators, plus as many + * as 3 output bytes per input byte; since the input is at most 1GB this + * can't overflow size_t. */ - result = (text *) palloc(VARHDRSZ + 6 + 3 * plen); + result = (text *) palloc(VARHDRSZ + 23 + 3 * (size_t) plen); r = VARDATA(result); *r++ = '^'; @@ -760,7 +782,7 @@ similar_escape(PG_FUNCTION_ARGS) } else if (e && elen == mblen && memcmp(e, p, mblen) == 0) { - /* SQL99 escape character; do not send to output */ + /* SQL escape character; do not send to output */ afterescape = true; } else @@ -784,10 +806,45 @@ similar_escape(PG_FUNCTION_ARGS) /* fast path */ if (afterescape) { - if (pchar == '"' && !incharclass) /* for SUBSTRING patterns */ - *r++ = ((nquotes++ % 2) == 0) ? '(' : ')'; + if (pchar == '"' && !incharclass) /* escape-double-quote? */ + { + /* emit appropriate part separator, per notes above */ + if (nquotes == 0) + { + *r++ = ')'; + *r++ = '{'; + *r++ = '1'; + *r++ = ','; + *r++ = '1'; + *r++ = '}'; + *r++ = '?'; + *r++ = '('; + } + else if (nquotes == 1) + { + *r++ = ')'; + *r++ = '{'; + *r++ = '1'; + *r++ = ','; + *r++ = '1'; + *r++ = '}'; + *r++ = '('; + *r++ = '?'; + *r++ = ':'; + } + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_USE_OF_ESCAPE_CHARACTER), + errmsg("SQL regular expression may not contain more than two escape-double-quote separators"))); + nquotes++; + } else { + /* + * We allow any character at all to be escaped; notably, this + * allows access to POSIX character-class escapes such as + * "\d". The SQL spec is considerably more restrictive. + */ *r++ = '\\'; *r++ = pchar; } @@ -795,7 +852,7 @@ similar_escape(PG_FUNCTION_ARGS) } else if (e && pchar == *e) { - /* SQL99 escape character; do not send to output */ + /* SQL escape character; do not send to output */ afterescape = true; } else if (incharclass) diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 7c7f8726fcc..24570bed163 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -313,7 +313,7 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; t (1 row) --- T581 regular expression substring (with SQL99's bizarre regexp syntax) +-- T581 regular expression substring (with SQL's bizarre regexp syntax) SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; bcd ----- @@ -328,13 +328,13 @@ SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; (1 row) -- Null inputs should return NULL -SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True"; True ------ t (1 row) -SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True"; True ------ t @@ -346,8 +346,57 @@ SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; t (1 row) --- PostgreSQL extension to allow omitting the escape character; --- here the regexp is taken as Posix syntax +-- The first and last parts should act non-greedy +SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef"; + bcdef +------- + bcdef +(1 row) + +SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg"; + abcdefg +--------- + abcdefg +(1 row) + +-- Vertical bar in any part affects only that part +SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef"; + bcdef +------- + bcdef +(1 row) + +SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef"; + bcdef +------- + bcdef +(1 row) + +SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef"; + bcdef +------- + bcdef +(1 row) + +-- Can't have more than two part separators +SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error"; +ERROR: SQL regular expression may not contain more than two escape-double-quote separators +CONTEXT: SQL function "substring" statement 1 +-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty +SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg"; + bcdefg +-------- + bcdefg +(1 row) + +SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg"; + abcdefg +--------- + abcdefg +(1 row) + +-- substring() with just two arguments is not allowed by SQL spec; +-- we accept it, but we interpret the pattern as a POSIX regexp not SQL SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; cde ----- diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 9cd2bc5d7e8..5744c9f8007 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -110,19 +110,35 @@ SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890"; SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; --- T581 regular expression substring (with SQL99's bizarre regexp syntax) +-- T581 regular expression substring (with SQL's bizarre regexp syntax) SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; -- No match should return NULL SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; -- Null inputs should return NULL -SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True"; -SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True"; +SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True"; SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; --- PostgreSQL extension to allow omitting the escape character; --- here the regexp is taken as Posix syntax +-- The first and last parts should act non-greedy +SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg"; + +-- Vertical bar in any part affects only that part +SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef"; + +-- Can't have more than two part separators +SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error"; + +-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty +SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg"; +SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg"; + +-- substring() with just two arguments is not allowed by SQL spec; +-- we accept it, but we interpret the pattern as a POSIX regexp not SQL SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; -- With a parenthesized subexpression, return only what matches the subexpr |