3 files changed, 140 insertions, 18 deletions
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index da13a875eb0..ab44846e0b3 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -708,20 +708,42 @@ similar_escape(PG_FUNCTION_ARGS)
 	 * We surround the transformed input string with
 	 *			^(?: ... )$
 	 * which requires some explanation.  We need "^" and "$" to force
-	 * the pattern to match the entire input string as per SQL99 spec.
+	 * the pattern to match the entire input string as per the SQL spec.
 	 * The "(?:" and ")" are a non-capturing set of parens; we have to have
 	 * parens in case the string contains "|", else the "^" and "$" will
 	 * be bound into the first and last alternatives which is not what we
 	 * want, and the parens must be non capturing because we don't want them
 	 * to count when selecting output for SUBSTRING.
+	 *
+	 * When the pattern is divided into three parts by escape-double-quotes,
+	 * what we emit is
+	 *			^(?:part1){1,1}?(part2){1,1}(?:part3)$
+	 * which requires even more explanation.  The "{1,1}?" on part1 makes it
+	 * non-greedy so that it will match the smallest possible amount of text
+	 * not the largest, as required by SQL.  The plain parens around part2
+	 * are capturing parens so that that part is what controls the result of
+	 * SUBSTRING.  The "{1,1}" forces part2 to be greedy, so that it matches
+	 * the largest possible amount of text; hence part3 must match the
+	 * smallest amount of text, as required by SQL.  We don't need an explicit
+	 * greediness marker on part3.  Note that this also confines the effects
+	 * of any "|" characters to the respective part, which is what we want.
+	 *
+	 * The SQL spec says that SUBSTRING's pattern must contain exactly two
+	 * escape-double-quotes, but we only complain if there's more than two.
+	 * With none, we act as though part1 and part3 are empty; with one, we
+	 * act as though part3 is empty.  Both behaviors fall out of omitting
+	 * the relevant part separators in the above expansion.  If the result
+	 * of this function is used in a plain regexp match (SIMILAR TO), the
+	 * escape-double-quotes have no effect on the match behavior.
 	 *----------
 	 */
 
 	/*
-	 * We need room for the prefix/postfix plus as many as 3 output bytes per
-	 * input byte; since the input is at most 1GB this can't overflow
+	 * We need room for the prefix/postfix and part separators, plus as many
+	 * as 3 output bytes per input byte; since the input is at most 1GB this
+	 * can't overflow size_t.
 	 */
-	result = (text *) palloc(VARHDRSZ + 6 + 3 * plen);
+	result = (text *) palloc(VARHDRSZ + 23 + 3 * (size_t) plen);
 	r = VARDATA(result);
 
 	*r++ = '^';
@@ -760,7 +782,7 @@ similar_escape(PG_FUNCTION_ARGS)
 				}
 				else if (e && elen == mblen && memcmp(e, p, mblen) == 0)
 				{
-					/* SQL99 escape character; do not send to output */
+					/* SQL escape character; do not send to output */
 					afterescape = true;
 				}
 				else
@@ -784,10 +806,45 @@ similar_escape(PG_FUNCTION_ARGS)
 		/* fast path */
 		if (afterescape)
 		{
-			if (pchar == '"' && !incharclass)	/* for SUBSTRING patterns */
-				*r++ = ((nquotes++ % 2) == 0) ? '(' : ')';
+			if (pchar == '"' && !incharclass)	/* escape-double-quote? */
+			{
+				/* emit appropriate part separator, per notes above */
+				if (nquotes == 0)
+				{
+					*r++ = ')';
+					*r++ = '{';
+					*r++ = '1';
+					*r++ = ',';
+					*r++ = '1';
+					*r++ = '}';
+					*r++ = '?';
+					*r++ = '(';
+				}
+				else if (nquotes == 1)
+				{
+					*r++ = ')';
+					*r++ = '{';
+					*r++ = '1';
+					*r++ = ',';
+					*r++ = '1';
+					*r++ = '}';
+					*r++ = '(';
+					*r++ = '?';
+					*r++ = ':';
+				}
+				else
+					ereport(ERROR,
+							(errcode(ERRCODE_INVALID_USE_OF_ESCAPE_CHARACTER),
+							 errmsg("SQL regular expression may not contain more than two escape-double-quote separators")));
+				nquotes++;
+			}
 			else
 			{
+				/*
+				 * We allow any character at all to be escaped; notably, this
+				 * allows access to POSIX character-class escapes such as
+				 * "\d".  The SQL spec is considerably more restrictive.
+				 */
 				*r++ = '\\';
 				*r++ = pchar;
 			}
@@ -795,7 +852,7 @@ similar_escape(PG_FUNCTION_ARGS)
 		}
 		else if (e && pchar == *e)
 		{
-			/* SQL99 escape character; do not send to output */
+			/* SQL escape character; do not send to output */
 			afterescape = true;
 		}
 		else if (incharclass)
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 7c7f8726fcc..24570bed163 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -313,7 +313,7 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
  t
 (1 row)
 
--- T581 regular expression substring (with SQL99's bizarre regexp syntax)
+-- T581 regular expression substring (with SQL's bizarre regexp syntax)
 SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
  bcd 
 -----
@@ -328,13 +328,13 @@ SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
 (1 row)
 
 -- Null inputs should return NULL
-SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True";
  True 
 ------
  t
 (1 row)
 
-SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True";
  True 
 ------
  t
@@ -346,8 +346,57 @@ SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
  t
 (1 row)
 
--- PostgreSQL extension to allow omitting the escape character;
--- here the regexp is taken as Posix syntax
+-- The first and last parts should act non-greedy
+SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef";
+ bcdef 
+-------
+ bcdef
+(1 row)
+
+SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg";
+ abcdefg 
+---------
+ abcdefg
+(1 row)
+
+-- Vertical bar in any part affects only that part
+SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef";
+ bcdef 
+-------
+ bcdef
+(1 row)
+
+SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef";
+ bcdef 
+-------
+ bcdef
+(1 row)
+
+SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef";
+ bcdef 
+-------
+ bcdef
+(1 row)
+
+-- Can't have more than two part separators
+SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error";
+ERROR:  SQL regular expression may not contain more than two escape-double-quote separators
+CONTEXT:  SQL function "substring" statement 1
+-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty
+SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg";
+ bcdefg 
+--------
+ bcdefg
+(1 row)
+
+SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg";
+ abcdefg 
+---------
+ abcdefg
+(1 row)
+
+-- substring() with just two arguments is not allowed by SQL spec;
+-- we accept it, but we interpret the pattern as a POSIX regexp not SQL
 SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde";
  cde 
 -----
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 9cd2bc5d7e8..5744c9f8007 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -110,19 +110,35 @@ SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890";
 
 SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
 
--- T581 regular expression substring (with SQL99's bizarre regexp syntax)
+-- T581 regular expression substring (with SQL's bizarre regexp syntax)
 SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
 
 -- No match should return NULL
 SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
 
 -- Null inputs should return NULL
-SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True";
-SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True";
+SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True";
 SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
 
--- PostgreSQL extension to allow omitting the escape character;
--- here the regexp is taken as Posix syntax
+-- The first and last parts should act non-greedy
+SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg";
+
+-- Vertical bar in any part affects only that part
+SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef";
+
+-- Can't have more than two part separators
+SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error";
+
+-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty
+SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg";
+SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg";
+
+-- substring() with just two arguments is not allowed by SQL spec;
+-- we accept it, but we interpret the pattern as a POSIX regexp not SQL
 SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde";
 
 -- With a parenthesized subexpression, return only what matches the subexpr