diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2002-09-22 17:27:25 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2002-09-22 17:27:25 +0000 |
commit | 9946b83dedb629b9eff1c063b9fbcaab25c209df (patch) | |
tree | b95a72601c503b1139154c1d95561c37b9dfabf8 /src | |
parent | ac355d558e27dd6b11b1d202de887a6d62d22ac6 (diff) | |
download | postgresql-9946b83dedb629b9eff1c063b9fbcaab25c209df.tar.gz postgresql-9946b83dedb629b9eff1c063b9fbcaab25c209df.zip |
Bring SIMILAR TO and SUBSTRING into some semblance of conformance with
the SQL99 standard. (I'm not sure that the character-class features are
quite right, but that can be fixed later.) Document SQL99 and POSIX
regexps as being different features; provide variants of SUBSTRING for
each.
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/parser/gram.y | 28 | ||||
-rw-r--r-- | src/backend/utils/adt/regexp.c | 127 | ||||
-rw-r--r-- | src/include/catalog/pg_proc.h | 15 | ||||
-rw-r--r-- | src/include/utils/builtins.h | 3 | ||||
-rw-r--r-- | src/test/regress/expected/strings.out | 18 | ||||
-rw-r--r-- | src/test/regress/sql/strings.sql | 15 |
6 files changed, 175 insertions, 31 deletions
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 43597306d44..be45d7bde16 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.367 2002/09/18 21:35:21 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.368 2002/09/22 17:27:23 tgl Exp $ * * HISTORY * AUTHOR DATE MAJOR EVENT @@ -5644,22 +5644,40 @@ a_expr: c_expr { $$ = $1; } } | a_expr SIMILAR TO a_expr %prec SIMILAR - { $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, $4); } + { + A_Const *c = makeNode(A_Const); + FuncCall *n = makeNode(FuncCall); + c->val.type = T_Null; + n->funcname = SystemFuncName("similar_escape"); + n->args = makeList2($4, (Node *) c); + n->agg_star = FALSE; + n->agg_distinct = FALSE; + $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n); + } | a_expr SIMILAR TO a_expr ESCAPE a_expr { FuncCall *n = makeNode(FuncCall); - n->funcname = SystemFuncName("like_escape"); + n->funcname = SystemFuncName("similar_escape"); n->args = makeList2($4, $6); n->agg_star = FALSE; n->agg_distinct = FALSE; $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n); } | a_expr NOT SIMILAR TO a_expr %prec SIMILAR - { $$ = (Node *) makeSimpleA_Expr(OP, "!~", $1, $5); } + { + A_Const *c = makeNode(A_Const); + FuncCall *n = makeNode(FuncCall); + c->val.type = T_Null; + n->funcname = SystemFuncName("similar_escape"); + n->args = makeList2($5, (Node *) c); + n->agg_star = FALSE; + n->agg_distinct = FALSE; + $$ = (Node *) makeSimpleA_Expr(OP, "!~", $1, (Node *) n); + } | a_expr NOT SIMILAR TO a_expr ESCAPE a_expr { FuncCall *n = makeNode(FuncCall); - n->funcname = SystemFuncName("like_escape"); + n->funcname = SystemFuncName("similar_escape"); n->args = makeList2($5, $7); n->agg_star = FALSE; n->agg_distinct = FALSE; diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index b64d6ede65a..ebbca8f0401 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.42 2002/09/04 20:31:28 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.43 2002/09/22 17:27:23 tgl Exp $ * * Alistair Crooks added the code for the regex caching * agc - cached the regular expressions used - there's a good chance @@ -317,8 +317,7 @@ textregexsubstr(PG_FUNCTION_ARGS) char *sterm; int len; bool match; - int nmatch = 1; - regmatch_t pmatch; + regmatch_t pmatch[2]; /* be sure sterm is null-terminated */ len = VARSIZE(s) - VARHDRSZ; @@ -327,21 +326,131 @@ textregexsubstr(PG_FUNCTION_ARGS) sterm[len] = '\0'; /* - * We need the match info back from the pattern match to be able to - * actually extract the substring. It seems to be adequate to pass in - * a structure to return only one result. + * We pass two regmatch_t structs to get info about the overall match + * and the match for the first parenthesized subexpression (if any). + * If there is a parenthesized subexpression, we return what it matched; + * else return what the whole regexp matched. */ - match = RE_compile_and_execute(p, sterm, REG_EXTENDED, nmatch, &pmatch); + match = RE_compile_and_execute(p, sterm, REG_EXTENDED, 2, pmatch); + pfree(sterm); /* match? then return the substring matching the pattern */ if (match) { + int so, + eo; + + so = pmatch[1].rm_so; + eo = pmatch[1].rm_eo; + if (so < 0 || eo < 0) + { + /* no parenthesized subexpression */ + so = pmatch[0].rm_so; + eo = pmatch[0].rm_eo; + } + return (DirectFunctionCall3(text_substr, PointerGetDatum(s), - Int32GetDatum(pmatch.rm_so + 1), - Int32GetDatum(pmatch.rm_eo - pmatch.rm_so))); + Int32GetDatum(so + 1), + Int32GetDatum(eo - so))); } PG_RETURN_NULL(); } + +/* similar_escape() + * Convert a SQL99 regexp pattern to POSIX style, so it can be used by + * our regexp engine. + */ +Datum +similar_escape(PG_FUNCTION_ARGS) +{ + text *pat_text; + text *esc_text; + text *result; + unsigned char *p, + *e, + *r; + int plen, + elen; + bool afterescape = false; + int nquotes = 0; + + /* This function is not strict, so must test explicitly */ + if (PG_ARGISNULL(0)) + PG_RETURN_NULL(); + pat_text = PG_GETARG_TEXT_P(0); + p = VARDATA(pat_text); + plen = (VARSIZE(pat_text) - VARHDRSZ); + if (PG_ARGISNULL(1)) + { + /* No ESCAPE clause provided; default to backslash as escape */ + e = "\\"; + elen = 1; + } + else + { + esc_text = PG_GETARG_TEXT_P(1); + e = VARDATA(esc_text); + elen = (VARSIZE(esc_text) - VARHDRSZ); + if (elen == 0) + e = NULL; /* no escape character */ + else if (elen != 1) + elog(ERROR, "ESCAPE string must be empty or one character"); + } + + /* We need room for ^, $, and up to 2 output bytes per input byte */ + result = (text *) palloc(VARHDRSZ + 2 + 2 * plen); + r = VARDATA(result); + + *r++ = '^'; + + while (plen > 0) + { + unsigned char pchar = *p; + + if (afterescape) + { + if (pchar == '"') /* for SUBSTRING patterns */ + *r++ = ((nquotes++ % 2) == 0) ? '(' : ')'; + else + { + *r++ = '\\'; + *r++ = pchar; + } + afterescape = false; + } + else if (e && pchar == *e) + { + /* SQL99 escape character; do not send to output */ + afterescape = true; + } + else if (pchar == '%') + { + *r++ = '.'; + *r++ = '*'; + } + else if (pchar == '_') + { + *r++ = '.'; + } + else if (pchar == '\\' || pchar == '.' || pchar == '?' || + pchar == '{') + { + *r++ = '\\'; + *r++ = pchar; + } + else + { + *r++ = pchar; + } + p++, plen--; + } + + *r++ = '$'; + + VARATT_SIZEP(result) = r - ((unsigned char *) result); + + PG_RETURN_TEXT_P(result); +} diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index eb44f283b91..369da463e8a 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: pg_proc.h,v 1.272 2002/09/18 21:35:23 tgl Exp $ + * $Id: pg_proc.h,v 1.273 2002/09/22 17:27:23 tgl Exp $ * * NOTES * The script catalog/genbki.sh reads this file and generates .bki @@ -2076,6 +2076,9 @@ DESCR("convert int4 to char"); DATA(insert OID = 1622 ( repeat PGNSP PGUID 12 f f t f i 2 25 "25 23" repeat - _null_ )); DESCR("replicate string int4 times"); +DATA(insert OID = 1623 ( similar_escape PGNSP PGUID 12 f f f f i 2 25 "25 25" similar_escape - _null_ )); +DESCR("convert SQL99 regexp pattern to POSIX style"); + DATA(insert OID = 1624 ( mul_d_interval PGNSP PGUID 12 f f t f i 2 1186 "701 1186" mul_d_interval - _null_ )); DATA(insert OID = 1633 ( texticlike PGNSP PGUID 12 f f t f i 2 16 "25 25" texticlike - _null_ )); @@ -2087,7 +2090,7 @@ DESCR("matches LIKE expression, case-insensitive"); DATA(insert OID = 1636 ( nameicnlike PGNSP PGUID 12 f f t f i 2 16 "19 25" nameicnlike - _null_ )); DESCR("does not match LIKE expression, case-insensitive"); DATA(insert OID = 1637 ( like_escape PGNSP PGUID 12 f f t f i 2 25 "25 25" like_escape - _null_ )); -DESCR("convert match pattern to use backslash escapes"); +DESCR("convert LIKE pattern to use backslash escapes"); DATA(insert OID = 1689 ( update_pg_pwd_and_pg_group PGNSP PGUID 12 f f t f v 0 2279 "" update_pg_pwd_and_pg_group - _null_ )); DESCR("update pg_pwd and pg_group files"); @@ -2784,7 +2787,7 @@ DESCR("matches LIKE expression"); DATA(insert OID = 2008 ( notlike PGNSP PGUID 12 f f t f i 2 16 "17 17" byteanlike - _null_ )); DESCR("does not match LIKE expression"); DATA(insert OID = 2009 ( like_escape PGNSP PGUID 12 f f t f i 2 17 "17 17" like_escape_bytea - _null_ )); -DESCR("convert match pattern to use backslash escapes"); +DESCR("convert LIKE pattern to use backslash escapes"); DATA(insert OID = 2010 ( length PGNSP PGUID 12 f f t f i 1 23 "17" byteaoctetlen - _null_ )); DESCR("octet length"); DATA(insert OID = 2011 ( byteacat PGNSP PGUID 12 f f t f i 2 17 "17 17" byteacat - _null_ )); @@ -2889,9 +2892,9 @@ DATA(insert OID = 2072 ( date_mi_interval PGNSP PGUID 14 f f t f i 2 1114 "1082 DESCR("subtract"); DATA(insert OID = 2073 ( substring PGNSP PGUID 12 f f t f i 2 25 "25 25" textregexsubstr - _null_ )); -DESCR("substitutes regular expression"); -DATA(insert OID = 2074 ( substring PGNSP PGUID 14 f f t f i 3 25 "25 25 25" "select substring($1, like_escape($2, $3))" - _null_ )); -DESCR("substitutes regular expression with escape argument"); +DESCR("extracts text matching regular expression"); +DATA(insert OID = 2074 ( substring PGNSP PGUID 14 f f t f i 3 25 "25 25 25" "select substring($1, similar_escape($2, $3))" - _null_ )); +DESCR("extracts text matching SQL99 regular expression"); DATA(insert OID = 2075 ( bit PGNSP PGUID 12 f f t f i 1 1560 "20" bitfromint8 - _null_ )); DESCR("int8 to bitstring"); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index bfa5fa8efc7..9f1c9ba8dbf 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: builtins.h,v 1.201 2002/09/19 22:48:34 tgl Exp $ + * $Id: builtins.h,v 1.202 2002/09/22 17:27:25 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -371,6 +371,7 @@ extern Datum nameicregexne(PG_FUNCTION_ARGS); extern Datum texticregexeq(PG_FUNCTION_ARGS); extern Datum texticregexne(PG_FUNCTION_ARGS); extern Datum textregexsubstr(PG_FUNCTION_ARGS); +extern Datum similar_escape(PG_FUNCTION_ARGS); /* regproc.c */ extern Datum regprocin(PG_FUNCTION_ARGS); diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 576fafb7729..a73ca1aa84b 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -142,15 +142,15 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; t (1 row) --- T581 regular expression substring -SELECT SUBSTRING('abcdefg' FROM '(b|f).*(d)' FOR '#') AS "bcd"; +-- T581 regular expression substring (with SQL99's bizarre regexp syntax) +SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; bcd ----- bcd (1 row) -- No match should return NULL -SELECT SUBSTRING('abcdefg' FROM '(1|2|3)' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; True ------ t @@ -175,8 +175,16 @@ SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; t (1 row) --- PostgreSQL extention to allow omitting the escape character -SELECT SUBSTRING('abcdefg' FROM '(c|d).e') AS "cde"; +-- PostgreSQL extension to allow omitting the escape character; +-- here the regexp is taken as Posix syntax +SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; + cde +----- + cde +(1 row) + +-- With a parenthesized subexpression, return only what matches the subexpr +SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; cde ----- cde diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index e5c15bc528f..c0a18959cd3 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -62,19 +62,24 @@ SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890"; SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; --- T581 regular expression substring -SELECT SUBSTRING('abcdefg' FROM '(b|f).*(d)' FOR '#') AS "bcd"; +-- T581 regular expression substring (with SQL99's bizarre regexp syntax) +SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; -- No match should return NULL -SELECT SUBSTRING('abcdefg' FROM '(1|2|3)' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; -- Null inputs should return NULL SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True"; SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True"; SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; --- PostgreSQL extention to allow omitting the escape character -SELECT SUBSTRING('abcdefg' FROM '(c|d).e') AS "cde"; +-- PostgreSQL extension to allow omitting the escape character; +-- here the regexp is taken as Posix syntax +SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; + +-- With a parenthesized subexpression, return only what matches the subexpr +SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; + -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4"; |