diff options
-rw-r--r-- | doc/src/sgml/func.sgml | 858 | ||||
-rw-r--r-- | src/backend/parser/gram.y | 28 | ||||
-rw-r--r-- | src/backend/utils/adt/regexp.c | 127 | ||||
-rw-r--r-- | src/include/catalog/pg_proc.h | 15 | ||||
-rw-r--r-- | src/include/utils/builtins.h | 3 | ||||
-rw-r--r-- | src/test/regress/expected/strings.out | 18 | ||||
-rw-r--r-- | src/test/regress/sql/strings.sql | 15 |
7 files changed, 683 insertions, 381 deletions
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 544d070a85c..7b12f80fb78 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -1,5 +1,5 @@ <!-- -$Header: /cvsroot/pgsql/doc/src/sgml/func.sgml,v 1.124 2002/09/21 18:32:53 petere Exp $ +$Header: /cvsroot/pgsql/doc/src/sgml/func.sgml,v 1.125 2002/09/22 17:27:23 tgl Exp $ PostgreSQL documentation --> @@ -921,19 +921,32 @@ PostgreSQL documentation </row> <row> - <entry><function>substring</function>(<parameter>string</parameter> <optional>from <replaceable>pattern</replaceable></optional> <optional>for <replaceable>escape</replaceable></optional>)</entry> + <entry><function>substring</function>(<parameter>string</parameter> from <replaceable>pattern</replaceable>)</entry> <entry><type>text</type></entry> <entry> - extract regular expression + extract substring matching POSIX regular expression <indexterm> <primary>substring</primary> </indexterm> </entry> - <entry><literal>substring('Thomas' from 'mas$' for <optional>escape '\\'</optional>)</literal></entry> + <entry><literal>substring('Thomas' from '...$')</literal></entry> <entry><literal>mas</literal></entry> </row> <row> + <entry><function>substring</function>(<parameter>string</parameter> from <replaceable>pattern</replaceable> for <replaceable>escape</replaceable>)</entry> + <entry><type>text</type></entry> + <entry> + extract substring matching SQL99 regular expression + <indexterm> + <primary>substring</primary> + </indexterm> + </entry> + <entry><literal>substring('Thomas' from '%#"o_a#"_' for '#')</literal></entry> + <entry><literal>oma</literal></entry> + </row> + + <row> <entry> <function>trim</function>(<optional>leading | trailing | both</optional> <optional><parameter>characters</parameter></optional> from @@ -960,6 +973,328 @@ PostgreSQL documentation </tgroup> </table> + <para> + Additional string manipulation functions are available and are + listed below. Some of them are used internally to implement the + <acronym>SQL</acronym>-standard string functions listed above. + </para> + + <table id="functions-string-other"> + <title>Other String Functions</title> + <tgroup cols="5"> + <thead> + <row> + <entry>Function</entry> + <entry>Return Type</entry> + <entry>Description</entry> + <entry>Example</entry> + <entry>Result</entry> + </row> + </thead> + + <tbody> + <row> + <entry><function>ascii</function>(<type>text</type>)</entry> + <entry>integer</entry> + <entry>Returns the <acronym>ASCII</acronym> code of the first character of the argument.</entry> + <entry><literal>ascii('x')</literal></entry> + <entry><literal>120</literal></entry> + </row> + + <row> + <entry><function>btrim</function>(<parameter>string</parameter> <type>text</type>, <parameter>trim</parameter> <type>text</type>)</entry> + <entry><type>text</type></entry> + <entry> + Remove (trim) the longest string consisting only of characters + in <parameter>trim</parameter> from the start and end of + <parameter>string</parameter>. + </entry> + <entry><literal>btrim('xyxtrimyyx','xy')</literal></entry> + <entry><literal>trim</literal></entry> + </row> + + <row> + <entry><function>chr</function>(<type>integer</type>)</entry> + <entry><type>text</type></entry> + <entry>Returns the character with the given <acronym>ASCII</acronym> code.</entry> + <entry><literal>chr(65)</literal></entry> + <entry><literal>A</literal></entry> + </row> + + <row> + <entry> + <function>convert</function>(<parameter>string</parameter> + <type>text</type>, + <optional><parameter>src_encoding</parameter> <type>name</type>,</optional> + <parameter>dest_encoding</parameter> <type>name</type>) + </entry> + <entry><type>text</type></entry> + <entry> + Converts string using <parameter>dest_encoding</parameter>. + The original encoding is specified by + <parameter>src_encoding</parameter>. If + <parameter>src_encoding</parameter> is omitted, database + encoding is assumed. + </entry> + <entry><literal>convert('text_in_unicode', 'UNICODE', 'LATIN1')</literal></entry> + <entry><literal>text_in_unicode</literal> represented in ISO 8859-1</entry> + </row> + + <row> + <entry> + <function>decode</function>(<parameter>string</parameter> <type>text</type>, + <parameter>type</parameter> <type>text</type>) + </entry> + <entry><type>bytea</type></entry> + <entry> + Decodes binary data from <parameter>string</parameter> previously + encoded with encode(). Parameter type is same as in encode(). + </entry> + <entry><literal>decode('MTIzAAE=', 'base64')</literal></entry> + <entry><literal>123\000\001</literal></entry> + </row> + + <row> + <entry> + <function>encode</function>(<parameter>data</parameter> <type>bytea</type>, + <parameter>type</parameter> <type>text</type>) + </entry> + <entry><type>text</type></entry> + <entry> + Encodes binary data to <acronym>ASCII</acronym>-only representation. Supported + types are: 'base64', 'hex', 'escape'. + </entry> + <entry><literal>encode('123\\000\\001', 'base64')</literal></entry> + <entry><literal>MTIzAAE=</literal></entry> + </row> + + <row> + <entry><function>initcap</function>(<type>text</type>)</entry> + <entry><type>text</type></entry> + <entry>Converts first letter of each word (whitespace separated) to upper case.</entry> + <entry><literal>initcap('hi thomas')</literal></entry> + <entry><literal>Hi Thomas</literal></entry> + </row> + + <row> + <entry><function>length</function>(<parameter>string</parameter>)</entry> + <entry><type>integer</type></entry> + <entry> + length of string + <indexterm> + <primary>character strings</primary> + <secondary>length</secondary> + </indexterm> + <indexterm> + <primary>length</primary> + <secondary>character strings</secondary> + <see>character strings, length</see> + </indexterm> + </entry> + <entry><literal>length('jose')</literal></entry> + <entry><literal>4</literal></entry> + </row> + + <row> + <entry> + <function>lpad</function>(<parameter>string</parameter> <type>text</type>, + <parameter>length</parameter> <type>integer</type> + <optional>, <parameter>fill</parameter> <type>text</type></optional>) + </entry> + <entry>text</entry> + <entry> + Fills up the <parameter>string</parameter> to length + <parameter>length</parameter> by prepending the characters + <parameter>fill</parameter> (a space by default). If the + <parameter>string</parameter> is already longer than + <parameter>length</parameter> then it is truncated (on the + right). + </entry> + <entry><literal>lpad('hi', 5, 'xy')</literal></entry> + <entry><literal>xyxhi</literal></entry> + </row> + + <row> + <entry><function>ltrim</function>(<parameter>string</parameter> <type>text</type>, <parameter>text</parameter> <type>text</type>)</entry> + <entry><type>text</type></entry> + <entry> + Removes the longest string containing only characters from + <parameter>trim</parameter> from the start of the string. + </entry> + <entry><literal>ltrim('zzzytrim','xyz')</literal></entry> + <entry><literal>trim</literal></entry> + </row> + + <row> + <entry><function>pg_client_encoding</function>()</entry> + <entry><type>name</type></entry> + <entry> + Returns current client encoding name. + </entry> + <entry><literal>pg_client_encoding()</literal></entry> + <entry><literal>SQL_ASCII</literal></entry> + </row> + + <row> + <entry><function>quote_ident</function>(<parameter>string</parameter> text)</entry> + <entry><type>text</type></entry> + <entry> + Returns the given string suitably quoted to be used as an identifier + in an SQL query string. + Quotes are added only if necessary (i.e., if the string contains + non-identifier characters or would be case-folded). + Embedded quotes are properly doubled. + </entry> + <entry><literal>quote_ident('Foo')</literal></entry> + <entry><literal>"Foo"</literal></entry> + </row> + + <row> + <entry><function>quote_literal</function>(<parameter>string</parameter> text)</entry> + <entry><type>text</type></entry> + <entry> + Returns the given string suitably quoted to be used as a literal + in an SQL query string. + Embedded quotes and backslashes are properly doubled. + </entry> + <entry><literal>quote_literal('O\'Reilly')</literal></entry> + <entry><literal>'O''Reilly'</literal></entry> + </row> + + <row> + <entry><function>repeat</function>(<type>text</type>, <type>integer</type>)</entry> + <entry><type>text</type></entry> + <entry>Repeat text a number of times.</entry> + <entry><literal>repeat('Pg', 4)</literal></entry> + <entry><literal>PgPgPgPg</literal></entry> + </row> + + <row> + <entry><function>replace</function>(<parameter>string</parameter> <type>text</type>, + <parameter>from</parameter> <type>text</type>, + <parameter>to</parameter> <type>text</type>)</entry> + <entry><type>text</type></entry> + <entry>Replace all occurrences in <parameter>string</parameter> of substring + <parameter>from</parameter> with substring <parameter>to</parameter> + </entry> + <entry><literal>replace('abcdefabcdef', 'cd', 'XX')</literal></entry> + <entry><literal>abXXefabXXef</literal></entry> + </row> + + <row> + <entry> + <function>rpad</function>(<parameter>string</parameter> <type>text</type>, + <parameter>length</parameter> <type>integer</type> + <optional>, <parameter>fill</parameter> <type>text</type></optional>) + </entry> + <entry><type>text</type></entry> + <entry> + Fills up the <parameter>string</parameter> to length + <parameter>length</parameter> by appending the characters + <parameter>fill</parameter> (a space by default). If the + <parameter>string</parameter> is already longer than + <parameter>length</parameter> then it is truncated. + </entry> + <entry><literal>rpad('hi', 5, 'xy')</literal></entry> + <entry><literal>hixyx</literal></entry> + </row> + + <row> + <entry><function>rtrim</function>(<parameter>string</parameter> + text, <parameter>trim</parameter> text)</entry> + <entry><type>text</type></entry> + <entry> + Removes the longest string containing only characters from + <parameter>trim</parameter> from the end of the string. + </entry> + <entry><literal>rtrim('trimxxxx','x')</literal></entry> + <entry><literal>trim</literal></entry> + </row> + + <row> + <entry><function>split_part</function>(<parameter>string</parameter> <type>text</type>, + <parameter>delimiter</parameter> <type>text</type>, + <parameter>column</parameter> <type>integer</type>)</entry> + <entry><type>text</type></entry> + <entry>Split <parameter>string</parameter> on <parameter>delimiter</parameter> + returning the resulting (one based) <parameter>column</parameter> number. + </entry> + <entry><literal>split_part('abc~@~def~@~ghi','~@~',2)</literal></entry> + <entry><literal>def</literal></entry> + </row> + + <row> + <entry><function>strpos</function>(<parameter>string</parameter>, <parameter>substring</parameter>)</entry> + <entry><type>text</type></entry> + <entry> + Locates specified substring. (same as + <literal>position(<parameter>substring</parameter> in + <parameter>string</parameter>)</literal>, but note the reversed + argument order) + </entry> + <entry><literal>strpos('high','ig')</literal></entry> + <entry><literal>2</literal></entry> + </row> + + <row> + <entry><function>substr</function>(<parameter>string</parameter>, <parameter>from</parameter> <optional>, <parameter>count</parameter></optional>)</entry> + <entry><type>text</type></entry> + <entry> + Extracts specified substring. (same as + <literal>substring(<parameter>string</parameter> from <parameter>from</parameter> for <parameter>count</parameter>)</literal>) + </entry> + <entry><literal>substr('alphabet', 3, 2)</literal></entry> + <entry><literal>ph</literal></entry> + </row> + + <row> + <entry><function>to_ascii</function>(<type>text</type> + <optional>, <parameter>encoding</parameter></optional>)</entry> + <entry><type>text</type></entry> + <entry>Converts text from multibyte encoding to <acronym>ASCII</acronym>.</entry> + <entry><literal>to_ascii('Karel')</literal></entry> + <entry><literal>Karel</literal></entry> + </row> + + <row> + <entry><function>to_hex</function>(<parameter>number</parameter> <type>integer</type> + or <type>bigint</type>)</entry> + <entry><type>text</type></entry> + <entry>Convert <parameter>number</parameter> to its equivalent hexadecimal + representation. + </entry> + <entry><literal>to_hex(9223372036854775807::bigint)</literal></entry> + <entry><literal>7fffffffffffffff</literal></entry> + </row> + + <row> + <entry> + <function>translate</function>(<parameter>string</parameter> + <type>text</type>, + <parameter>from</parameter> <type>text</type>, + <parameter>to</parameter> <type>text</type>) + </entry> + <entry><type>text</type></entry> + <entry> + Any character in <parameter>string</parameter> that matches a + character in the <parameter>from</parameter> set is replaced by + the corresponding character in the <parameter>to</parameter> + set. + </entry> + <entry><literal>translate('12345', '14', 'ax')</literal></entry> + <entry><literal>a23x5</literal></entry> + </row> + + </tbody> + </tgroup> + </table> + + <para> + The <function>to_ascii</function> function supports conversion from + LATIN1, LATIN2, WIN1250 (CP1250) only. + </para> + + <table id="conversion-names"> <title>Available conversion names</title> <tgroup cols="3"> @@ -1660,326 +1995,6 @@ PostgreSQL documentation </tgroup> </table> - <para> - Additional string manipulation functions are available and are - listed below. Some of them are used internally to implement the - <acronym>SQL</acronym>-standard string functions listed above. - </para> - - <table id="functions-string-other"> - <title>Other String Functions</title> - <tgroup cols="5"> - <thead> - <row> - <entry>Function</entry> - <entry>Return Type</entry> - <entry>Description</entry> - <entry>Example</entry> - <entry>Result</entry> - </row> - </thead> - - <tbody> - <row> - <entry><function>ascii</function>(<type>text</type>)</entry> - <entry>integer</entry> - <entry>Returns the <acronym>ASCII</acronym> code of the first character of the argument.</entry> - <entry><literal>ascii('x')</literal></entry> - <entry><literal>120</literal></entry> - </row> - - <row> - <entry><function>btrim</function>(<parameter>string</parameter> <type>text</type>, <parameter>trim</parameter> <type>text</type>)</entry> - <entry><type>text</type></entry> - <entry> - Remove (trim) the longest string consisting only of characters - in <parameter>trim</parameter> from the start and end of - <parameter>string</parameter>. - </entry> - <entry><literal>btrim('xyxtrimyyx','xy')</literal></entry> - <entry><literal>trim</literal></entry> - </row> - - <row> - <entry><function>chr</function>(<type>integer</type>)</entry> - <entry><type>text</type></entry> - <entry>Returns the character with the given <acronym>ASCII</acronym> code.</entry> - <entry><literal>chr(65)</literal></entry> - <entry><literal>A</literal></entry> - </row> - - <row> - <entry> - <function>convert</function>(<parameter>string</parameter> - <type>text</type>, - <optional><parameter>src_encoding</parameter> <type>name</type>,</optional> - <parameter>dest_encoding</parameter> <type>name</type>) - </entry> - <entry><type>text</type></entry> - <entry> - Converts string using <parameter>dest_encoding</parameter>. - The original encoding is specified by - <parameter>src_encoding</parameter>. If - <parameter>src_encoding</parameter> is omitted, database - encoding is assumed. - </entry> - <entry><literal>convert('text_in_unicode', 'UNICODE', 'LATIN1')</literal></entry> - <entry><literal>text_in_unicode</literal> represented in ISO 8859-1</entry> - </row> - - <row> - <entry> - <function>decode</function>(<parameter>string</parameter> <type>text</type>, - <parameter>type</parameter> <type>text</type>) - </entry> - <entry><type>bytea</type></entry> - <entry> - Decodes binary data from <parameter>string</parameter> previously - encoded with encode(). Parameter type is same as in encode(). - </entry> - <entry><literal>decode('MTIzAAE=', 'base64')</literal></entry> - <entry><literal>123\000\001</literal></entry> - </row> - - <row> - <entry> - <function>encode</function>(<parameter>data</parameter> <type>bytea</type>, - <parameter>type</parameter> <type>text</type>) - </entry> - <entry><type>text</type></entry> - <entry> - Encodes binary data to <acronym>ASCII</acronym>-only representation. Supported - types are: 'base64', 'hex', 'escape'. - </entry> - <entry><literal>encode('123\\000\\001', 'base64')</literal></entry> - <entry><literal>MTIzAAE=</literal></entry> - </row> - - <row> - <entry><function>initcap</function>(<type>text</type>)</entry> - <entry><type>text</type></entry> - <entry>Converts first letter of each word (whitespace separated) to upper case.</entry> - <entry><literal>initcap('hi thomas')</literal></entry> - <entry><literal>Hi Thomas</literal></entry> - </row> - - <row> - <entry><function>length</function>(<parameter>string</parameter>)</entry> - <entry><type>integer</type></entry> - <entry> - length of string - <indexterm> - <primary>character strings</primary> - <secondary>length</secondary> - </indexterm> - <indexterm> - <primary>length</primary> - <secondary>character strings</secondary> - <see>character strings, length</see> - </indexterm> - </entry> - <entry><literal>length('jose')</literal></entry> - <entry><literal>4</literal></entry> - </row> - - <row> - <entry> - <function>lpad</function>(<parameter>string</parameter> <type>text</type>, - <parameter>length</parameter> <type>integer</type> - <optional>, <parameter>fill</parameter> <type>text</type></optional>) - </entry> - <entry>text</entry> - <entry> - Fills up the <parameter>string</parameter> to length - <parameter>length</parameter> by prepending the characters - <parameter>fill</parameter> (a space by default). If the - <parameter>string</parameter> is already longer than - <parameter>length</parameter> then it is truncated (on the - right). - </entry> - <entry><literal>lpad('hi', 5, 'xy')</literal></entry> - <entry><literal>xyxhi</literal></entry> - </row> - - <row> - <entry><function>ltrim</function>(<parameter>string</parameter> <type>text</type>, <parameter>text</parameter> <type>text</type>)</entry> - <entry><type>text</type></entry> - <entry> - Removes the longest string containing only characters from - <parameter>trim</parameter> from the start of the string. - </entry> - <entry><literal>ltrim('zzzytrim','xyz')</literal></entry> - <entry><literal>trim</literal></entry> - </row> - - <row> - <entry><function>pg_client_encoding</function>()</entry> - <entry><type>name</type></entry> - <entry> - Returns current client encoding name. - </entry> - <entry><literal>pg_client_encoding()</literal></entry> - <entry><literal>SQL_ASCII</literal></entry> - </row> - - <row> - <entry><function>quote_ident</function>(<parameter>string</parameter> text)</entry> - <entry><type>text</type></entry> - <entry> - Returns the given string suitably quoted to be used as an identifier - in an SQL query string. - Quotes are added only if necessary (i.e., if the string contains - non-identifier characters or would be case-folded). - Embedded quotes are properly doubled. - </entry> - <entry><literal>quote_ident('Foo')</literal></entry> - <entry><literal>"Foo"</literal></entry> - </row> - - <row> - <entry><function>quote_literal</function>(<parameter>string</parameter> text)</entry> - <entry><type>text</type></entry> - <entry> - Returns the given string suitably quoted to be used as a literal - in an SQL query string. - Embedded quotes and backslashes are properly doubled. - </entry> - <entry><literal>quote_literal('O\'Reilly')</literal></entry> - <entry><literal>'O''Reilly'</literal></entry> - </row> - - <row> - <entry><function>repeat</function>(<type>text</type>, <type>integer</type>)</entry> - <entry><type>text</type></entry> - <entry>Repeat text a number of times.</entry> - <entry><literal>repeat('Pg', 4)</literal></entry> - <entry><literal>PgPgPgPg</literal></entry> - </row> - - <row> - <entry><function>replace</function>(<parameter>string</parameter> <type>text</type>, - <parameter>from</parameter> <type>text</type>, - <parameter>to</parameter> <type>text</type>)</entry> - <entry><type>text</type></entry> - <entry>Replace all occurrences in <parameter>string</parameter> of substring - <parameter>from</parameter> with substring <parameter>to</parameter> - </entry> - <entry><literal>replace('abcdefabcdef', 'cd', 'XX')</literal></entry> - <entry><literal>abXXefabXXef</literal></entry> - </row> - - <row> - <entry> - <function>rpad</function>(<parameter>string</parameter> <type>text</type>, - <parameter>length</parameter> <type>integer</type> - <optional>, <parameter>fill</parameter> <type>text</type></optional>) - </entry> - <entry><type>text</type></entry> - <entry> - Fills up the <parameter>string</parameter> to length - <parameter>length</parameter> by appending the characters - <parameter>fill</parameter> (a space by default). If the - <parameter>string</parameter> is already longer than - <parameter>length</parameter> then it is truncated. - </entry> - <entry><literal>rpad('hi', 5, 'xy')</literal></entry> - <entry><literal>hixyx</literal></entry> - </row> - - <row> - <entry><function>rtrim</function>(<parameter>string</parameter> - text, <parameter>trim</parameter> text)</entry> - <entry><type>text</type></entry> - <entry> - Removes the longest string containing only characters from - <parameter>trim</parameter> from the end of the string. - </entry> - <entry><literal>rtrim('trimxxxx','x')</literal></entry> - <entry><literal>trim</literal></entry> - </row> - - <row> - <entry><function>split_part</function>(<parameter>string</parameter> <type>text</type>, - <parameter>delimiter</parameter> <type>text</type>, - <parameter>column</parameter> <type>integer</type>)</entry> - <entry><type>text</type></entry> - <entry>Split <parameter>string</parameter> on <parameter>delimiter</parameter> - returning the resulting (one based) <parameter>column</parameter> number. - </entry> - <entry><literal>split_part('abc~@~def~@~ghi','~@~',2)</literal></entry> - <entry><literal>def</literal></entry> - </row> - - <row> - <entry><function>strpos</function>(<parameter>string</parameter>, <parameter>substring</parameter>)</entry> - <entry><type>text</type></entry> - <entry> - Locates specified substring. (same as - <literal>position(<parameter>substring</parameter> in - <parameter>string</parameter>)</literal>, but note the reversed - argument order) - </entry> - <entry><literal>strpos('high','ig')</literal></entry> - <entry><literal>2</literal></entry> - </row> - - <row> - <entry><function>substr</function>(<parameter>string</parameter>, <parameter>from</parameter> <optional>, <parameter>count</parameter></optional>)</entry> - <entry><type>text</type></entry> - <entry> - Extracts specified substring. (same as - <literal>substring(<parameter>string</parameter> from <parameter>from</parameter> for <parameter>count</parameter>)</literal>) - </entry> - <entry><literal>substr('alphabet', 3, 2)</literal></entry> - <entry><literal>ph</literal></entry> - </row> - - <row> - <entry><function>to_ascii</function>(<type>text</type> - <optional>, <parameter>encoding</parameter></optional>)</entry> - <entry><type>text</type></entry> - <entry>Converts text from multibyte encoding to <acronym>ASCII</acronym>.</entry> - <entry><literal>to_ascii('Karel')</literal></entry> - <entry><literal>Karel</literal></entry> - </row> - - <row> - <entry><function>to_hex</function>(<parameter>number</parameter> <type>integer</type> - or <type>bigint</type>)</entry> - <entry><type>text</type></entry> - <entry>Convert <parameter>number</parameter> to its equivalent hexadecimal - representation. - </entry> - <entry><literal>to_hex(9223372036854775807::bigint)</literal></entry> - <entry><literal>7fffffffffffffff</literal></entry> - </row> - - <row> - <entry> - <function>translate</function>(<parameter>string</parameter> - <type>text</type>, - <parameter>from</parameter> <type>text</type>, - <parameter>to</parameter> <type>text</type>) - </entry> - <entry><type>text</type></entry> - <entry> - Any character in <parameter>string</parameter> that matches a - character in the <parameter>from</parameter> set is replaced by - the corresponding character in the <parameter>to</parameter> - set. - </entry> - <entry><literal>translate('12345', '14', 'ax')</literal></entry> - <entry><literal>a23x5</literal></entry> - </row> - - </tbody> - </tgroup> - </table> - - <para> - The <function>to_ascii</function> function supports conversion from - LATIN1, LATIN2, WIN1250 (CP1250) only. - </para> </sect1> @@ -2171,16 +2186,16 @@ PostgreSQL documentation <title>Pattern Matching</title> <para> - There are two separate approaches to pattern matching provided by + There are three separate approaches to pattern matching provided by <productname>PostgreSQL</productname>: the traditional <acronym>SQL</acronym> - <function>LIKE</function> operator and the more recent + <function>LIKE</function> operator, the more recent <acronym>SQL99</acronym> - <function>SIMILAR TO</function> operator implementing + <function>SIMILAR TO</function> operator, and <acronym>POSIX</acronym>-style regular expressions. Additionally, a pattern matching function, - <function>SUBSTRING</function>, is available, as defined in - <acronym>SQL99</acronym>. + <function>SUBSTRING</function>, is available, using either + <acronym>SQL99</acronym>-style or POSIX-style regular expressions. </para> <tip> @@ -2190,13 +2205,6 @@ PostgreSQL documentation </para> </tip> - <para> - Both <function>LIKE</function> and <function>SIMILAR TO</function> - are SQL-standard operators which are also available in alternate - forms as <productname>PostgreSQL</productname> operators; look at - <literal>~</literal> and <literal>~~</literal> for examples. - </para> - <sect2 id="functions-like"> <title><function>LIKE</function></title> @@ -2296,11 +2304,142 @@ PostgreSQL documentation </sect2> - <sect2 id="functions-regexp"> - <title><function>SIMILAR TO</function> and <acronym>POSIX</acronym> + <sect2 id="functions-sql99-regexp"> + <title><function>SIMILAR TO</function> and <acronym>SQL99</acronym> Regular Expressions</title> - <indexterm zone="functions-regexp"> + <indexterm zone="functions-sql99-regexp"> + <primary>regular expressions</primary> + <seealso>pattern matching</seealso> + </indexterm> + + <indexterm> + <primary>similar to</primary> + </indexterm> + + <indexterm> + <primary>substring</primary> + </indexterm> + + <synopsis> +<replaceable>string</replaceable> SIMILAR TO <replaceable>pattern</replaceable> <optional>ESCAPE <replaceable>escape-character</replaceable></optional> +<replaceable>string</replaceable> NOT SIMILAR TO <replaceable>pattern</replaceable> <optional>ESCAPE <replaceable>escape-character</replaceable></optional> + </synopsis> + + <para> + The <function>SIMILAR TO</function> operator returns true or false + depending on whether its pattern matches the given string. It is + much like <function>LIKE</function>, except that it interprets the + pattern using <acronym>SQL99</acronym>'s definition of a regular + expression. + <acronym>SQL99</acronym>'s regular expressions are a curious cross + between <function>LIKE</function> notation and common regular expression + notation. + </para> + + <para> + Like <function>LIKE</function>, the <function>SIMILAR TO</function> + operator succeeds only if its pattern matches the entire string; + this is unlike common regular expression practice, wherein the pattern + may match any part of the string. + Also like + <function>LIKE</function>, <function>SIMILAR TO</function> uses + <literal>%</> and <literal>_</> as wildcard characters denoting + any string and any single character, respectively (these are + comparable to <literal>.*</> and <literal>.</> in POSIX regular + expressions). + </para> + + <para> + In addition to these facilities borrowed from <function>LIKE</function>, + <function>SIMILAR TO</function> supports these pattern-matching + metacharacters borrowed from POSIX regular expressions: + + <itemizedlist> + <listitem> + <para> + <literal>|</literal> denotes alternation (either of two alternatives). + </para> + </listitem> + <listitem> + <para> + <literal>*</literal> denotes repetition of the previous item zero + or more times. + </para> + </listitem> + <listitem> + <para> + <literal>+</literal> denotes repetition of the previous item one + or more times. + </para> + </listitem> + <listitem> + <para> + Parentheses <literal>()</literal> may be used to group items into + a single logical item. + </para> + </listitem> + <listitem> + <para> + A bracket expression <literal>[...]</literal> specifies a character + class, just as in POSIX regular expressions. + </para> + </listitem> + </itemizedlist> + + Notice that bounded repetition (<literal>?</> and <literal>{...}</>) + are not provided, though they exist in POSIX. Also, dot (<literal>.</>) + is not a metacharacter. + </para> + + <para> + As with <function>LIKE</>, a backslash disables the special meaning + of any of these metacharacters; or a different escape character can + be specified with <literal>ESCAPE</>. + </para> + + <informalexample> + <para> + Some examples: + <programlisting> +'abc' SIMILAR TO 'abc' <lineannotation>true</lineannotation> +'abc' SIMILAR TO 'a' <lineannotation>false</lineannotation> +'abc' SIMILAR TO '%(b|d)%' <lineannotation>true</lineannotation> +'abc' SIMILAR TO '(b|c)%' <lineannotation>false</lineannotation> + </programlisting> + </para> + </informalexample> + + <para> + The SUBSTRING function with three parameters, + <function>SUBSTRING</function>(<parameter>string</parameter> FROM + <replaceable>pattern</replaceable> FOR + <replaceable>escape</replaceable>), provides extraction of a substring + that matches a SQL99 regular expression pattern. As with SIMILAR TO, + the specified pattern must match to the entire data string, else the + function fails and returns NULL. To indicate the part of the pattern + that should be returned on success, SQL99 specifies that the pattern + must contain two occurrences of the escape character followed by + double quote (<literal>"</>). The text matching the portion of the + pattern between these markers is returned. + </para> + + <informalexample> + <para> + Some examples: + <programlisting> +SUBSTRING('foobar' FROM '%#"o_b#"%' FOR '#') <lineannotation>oob</lineannotation> +SUBSTRING('foobar' FROM '#"o_b#"%' FOR '#') <lineannotation>NULL</lineannotation> + </programlisting> + </para> + </informalexample> + + </sect2> + + <sect2 id="functions-posix-regexp"> + <title><acronym>POSIX</acronym> Regular Expressions</title> + + <indexterm zone="functions-posix-regexp"> <primary>regular expressions</primary> <seealso>pattern matching</seealso> </indexterm> @@ -2341,12 +2480,6 @@ PostgreSQL documentation <entry>Does not match regular expression, case insensitive</entry> <entry><literal>'thomas' !~* '.*vadim.*'</literal></entry> </row> - - <row> - <entry> <literal>SIMILAR TO</literal> </entry> - <entry>Matches regular expression, case sensitive</entry> - <entry><literal>'thomas' SIMILAR TO '.*thomas.*'</literal></entry> - </row> </tbody> </tgroup> </table> @@ -2354,7 +2487,8 @@ PostgreSQL documentation <para> <acronym>POSIX</acronym> regular expressions provide a more powerful means for - pattern matching than the <function>LIKE</function> function. + pattern matching than the <function>LIKE</function> and + <function>SIMILAR TO</> operators. Many Unix tools such as <command>egrep</command>, <command>sed</command>, or <command>awk</command> use a pattern matching language that is similar to the one described here. @@ -2379,10 +2513,34 @@ PostgreSQL documentation <para> Some examples: <programlisting> -'abc' SIMILAR TO 'abc' <lineannotation>true</lineannotation> -'abc' SIMILAR TO '^a' <lineannotation>true</lineannotation> -'abc' SIMILAR TO '(b|d)' <lineannotation>true</lineannotation> -'abc' SIMILAR TO '^(b|c)' <lineannotation>false</lineannotation> +'abc' ~ 'abc' <lineannotation>true</lineannotation> +'abc' ~ '^a' <lineannotation>true</lineannotation> +'abc' ~ '(b|d)' <lineannotation>true</lineannotation> +'abc' ~ '^(b|c)' <lineannotation>false</lineannotation> + </programlisting> + </para> + </informalexample> + + <para> + The SUBSTRING function with two parameters, + <function>SUBSTRING</function>(<parameter>string</parameter> FROM + <replaceable>pattern</replaceable>), provides extraction of a substring + that matches a POSIX regular expression pattern. It returns NULL if + there is no match, otherwise the portion of the text that matched the + pattern. But if the pattern contains any parentheses, the portion + of the text that matched the first parenthesized subexpression (the + one whose left parenthesis comes first) is + returned. You can always put parentheses around the whole expression + if you want to use parentheses within it without triggering this + exception. + </para> + + <informalexample> + <para> + Some examples: + <programlisting> +SUBSTRING('foobar' FROM 'o.b') <lineannotation>oob</lineannotation> +SUBSTRING('foobar' FROM 'o(.)b') <lineannotation>o</lineannotation> </programlisting> </para> </informalexample> diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 43597306d44..be45d7bde16 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.367 2002/09/18 21:35:21 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.368 2002/09/22 17:27:23 tgl Exp $ * * HISTORY * AUTHOR DATE MAJOR EVENT @@ -5644,22 +5644,40 @@ a_expr: c_expr { $$ = $1; } } | a_expr SIMILAR TO a_expr %prec SIMILAR - { $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, $4); } + { + A_Const *c = makeNode(A_Const); + FuncCall *n = makeNode(FuncCall); + c->val.type = T_Null; + n->funcname = SystemFuncName("similar_escape"); + n->args = makeList2($4, (Node *) c); + n->agg_star = FALSE; + n->agg_distinct = FALSE; + $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n); + } | a_expr SIMILAR TO a_expr ESCAPE a_expr { FuncCall *n = makeNode(FuncCall); - n->funcname = SystemFuncName("like_escape"); + n->funcname = SystemFuncName("similar_escape"); n->args = makeList2($4, $6); n->agg_star = FALSE; n->agg_distinct = FALSE; $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n); } | a_expr NOT SIMILAR TO a_expr %prec SIMILAR - { $$ = (Node *) makeSimpleA_Expr(OP, "!~", $1, $5); } + { + A_Const *c = makeNode(A_Const); + FuncCall *n = makeNode(FuncCall); + c->val.type = T_Null; + n->funcname = SystemFuncName("similar_escape"); + n->args = makeList2($5, (Node *) c); + n->agg_star = FALSE; + n->agg_distinct = FALSE; + $$ = (Node *) makeSimpleA_Expr(OP, "!~", $1, (Node *) n); + } | a_expr NOT SIMILAR TO a_expr ESCAPE a_expr { FuncCall *n = makeNode(FuncCall); - n->funcname = SystemFuncName("like_escape"); + n->funcname = SystemFuncName("similar_escape"); n->args = makeList2($5, $7); n->agg_star = FALSE; n->agg_distinct = FALSE; diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index b64d6ede65a..ebbca8f0401 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.42 2002/09/04 20:31:28 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.43 2002/09/22 17:27:23 tgl Exp $ * * Alistair Crooks added the code for the regex caching * agc - cached the regular expressions used - there's a good chance @@ -317,8 +317,7 @@ textregexsubstr(PG_FUNCTION_ARGS) char *sterm; int len; bool match; - int nmatch = 1; - regmatch_t pmatch; + regmatch_t pmatch[2]; /* be sure sterm is null-terminated */ len = VARSIZE(s) - VARHDRSZ; @@ -327,21 +326,131 @@ textregexsubstr(PG_FUNCTION_ARGS) sterm[len] = '\0'; /* - * We need the match info back from the pattern match to be able to - * actually extract the substring. It seems to be adequate to pass in - * a structure to return only one result. + * We pass two regmatch_t structs to get info about the overall match + * and the match for the first parenthesized subexpression (if any). + * If there is a parenthesized subexpression, we return what it matched; + * else return what the whole regexp matched. */ - match = RE_compile_and_execute(p, sterm, REG_EXTENDED, nmatch, &pmatch); + match = RE_compile_and_execute(p, sterm, REG_EXTENDED, 2, pmatch); + pfree(sterm); /* match? then return the substring matching the pattern */ if (match) { + int so, + eo; + + so = pmatch[1].rm_so; + eo = pmatch[1].rm_eo; + if (so < 0 || eo < 0) + { + /* no parenthesized subexpression */ + so = pmatch[0].rm_so; + eo = pmatch[0].rm_eo; + } + return (DirectFunctionCall3(text_substr, PointerGetDatum(s), - Int32GetDatum(pmatch.rm_so + 1), - Int32GetDatum(pmatch.rm_eo - pmatch.rm_so))); + Int32GetDatum(so + 1), + Int32GetDatum(eo - so))); } PG_RETURN_NULL(); } + +/* similar_escape() + * Convert a SQL99 regexp pattern to POSIX style, so it can be used by + * our regexp engine. + */ +Datum +similar_escape(PG_FUNCTION_ARGS) +{ + text *pat_text; + text *esc_text; + text *result; + unsigned char *p, + *e, + *r; + int plen, + elen; + bool afterescape = false; + int nquotes = 0; + + /* This function is not strict, so must test explicitly */ + if (PG_ARGISNULL(0)) + PG_RETURN_NULL(); + pat_text = PG_GETARG_TEXT_P(0); + p = VARDATA(pat_text); + plen = (VARSIZE(pat_text) - VARHDRSZ); + if (PG_ARGISNULL(1)) + { + /* No ESCAPE clause provided; default to backslash as escape */ + e = "\\"; + elen = 1; + } + else + { + esc_text = PG_GETARG_TEXT_P(1); + e = VARDATA(esc_text); + elen = (VARSIZE(esc_text) - VARHDRSZ); + if (elen == 0) + e = NULL; /* no escape character */ + else if (elen != 1) + elog(ERROR, "ESCAPE string must be empty or one character"); + } + + /* We need room for ^, $, and up to 2 output bytes per input byte */ + result = (text *) palloc(VARHDRSZ + 2 + 2 * plen); + r = VARDATA(result); + + *r++ = '^'; + + while (plen > 0) + { + unsigned char pchar = *p; + + if (afterescape) + { + if (pchar == '"') /* for SUBSTRING patterns */ + *r++ = ((nquotes++ % 2) == 0) ? '(' : ')'; + else + { + *r++ = '\\'; + *r++ = pchar; + } + afterescape = false; + } + else if (e && pchar == *e) + { + /* SQL99 escape character; do not send to output */ + afterescape = true; + } + else if (pchar == '%') + { + *r++ = '.'; + *r++ = '*'; + } + else if (pchar == '_') + { + *r++ = '.'; + } + else if (pchar == '\\' || pchar == '.' || pchar == '?' || + pchar == '{') + { + *r++ = '\\'; + *r++ = pchar; + } + else + { + *r++ = pchar; + } + p++, plen--; + } + + *r++ = '$'; + + VARATT_SIZEP(result) = r - ((unsigned char *) result); + + PG_RETURN_TEXT_P(result); +} diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index eb44f283b91..369da463e8a 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: pg_proc.h,v 1.272 2002/09/18 21:35:23 tgl Exp $ + * $Id: pg_proc.h,v 1.273 2002/09/22 17:27:23 tgl Exp $ * * NOTES * The script catalog/genbki.sh reads this file and generates .bki @@ -2076,6 +2076,9 @@ DESCR("convert int4 to char"); DATA(insert OID = 1622 ( repeat PGNSP PGUID 12 f f t f i 2 25 "25 23" repeat - _null_ )); DESCR("replicate string int4 times"); +DATA(insert OID = 1623 ( similar_escape PGNSP PGUID 12 f f f f i 2 25 "25 25" similar_escape - _null_ )); +DESCR("convert SQL99 regexp pattern to POSIX style"); + DATA(insert OID = 1624 ( mul_d_interval PGNSP PGUID 12 f f t f i 2 1186 "701 1186" mul_d_interval - _null_ )); DATA(insert OID = 1633 ( texticlike PGNSP PGUID 12 f f t f i 2 16 "25 25" texticlike - _null_ )); @@ -2087,7 +2090,7 @@ DESCR("matches LIKE expression, case-insensitive"); DATA(insert OID = 1636 ( nameicnlike PGNSP PGUID 12 f f t f i 2 16 "19 25" nameicnlike - _null_ )); DESCR("does not match LIKE expression, case-insensitive"); DATA(insert OID = 1637 ( like_escape PGNSP PGUID 12 f f t f i 2 25 "25 25" like_escape - _null_ )); -DESCR("convert match pattern to use backslash escapes"); +DESCR("convert LIKE pattern to use backslash escapes"); DATA(insert OID = 1689 ( update_pg_pwd_and_pg_group PGNSP PGUID 12 f f t f v 0 2279 "" update_pg_pwd_and_pg_group - _null_ )); DESCR("update pg_pwd and pg_group files"); @@ -2784,7 +2787,7 @@ DESCR("matches LIKE expression"); DATA(insert OID = 2008 ( notlike PGNSP PGUID 12 f f t f i 2 16 "17 17" byteanlike - _null_ )); DESCR("does not match LIKE expression"); DATA(insert OID = 2009 ( like_escape PGNSP PGUID 12 f f t f i 2 17 "17 17" like_escape_bytea - _null_ )); -DESCR("convert match pattern to use backslash escapes"); +DESCR("convert LIKE pattern to use backslash escapes"); DATA(insert OID = 2010 ( length PGNSP PGUID 12 f f t f i 1 23 "17" byteaoctetlen - _null_ )); DESCR("octet length"); DATA(insert OID = 2011 ( byteacat PGNSP PGUID 12 f f t f i 2 17 "17 17" byteacat - _null_ )); @@ -2889,9 +2892,9 @@ DATA(insert OID = 2072 ( date_mi_interval PGNSP PGUID 14 f f t f i 2 1114 "1082 DESCR("subtract"); DATA(insert OID = 2073 ( substring PGNSP PGUID 12 f f t f i 2 25 "25 25" textregexsubstr - _null_ )); -DESCR("substitutes regular expression"); -DATA(insert OID = 2074 ( substring PGNSP PGUID 14 f f t f i 3 25 "25 25 25" "select substring($1, like_escape($2, $3))" - _null_ )); -DESCR("substitutes regular expression with escape argument"); +DESCR("extracts text matching regular expression"); +DATA(insert OID = 2074 ( substring PGNSP PGUID 14 f f t f i 3 25 "25 25 25" "select substring($1, similar_escape($2, $3))" - _null_ )); +DESCR("extracts text matching SQL99 regular expression"); DATA(insert OID = 2075 ( bit PGNSP PGUID 12 f f t f i 1 1560 "20" bitfromint8 - _null_ )); DESCR("int8 to bitstring"); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index bfa5fa8efc7..9f1c9ba8dbf 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: builtins.h,v 1.201 2002/09/19 22:48:34 tgl Exp $ + * $Id: builtins.h,v 1.202 2002/09/22 17:27:25 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -371,6 +371,7 @@ extern Datum nameicregexne(PG_FUNCTION_ARGS); extern Datum texticregexeq(PG_FUNCTION_ARGS); extern Datum texticregexne(PG_FUNCTION_ARGS); extern Datum textregexsubstr(PG_FUNCTION_ARGS); +extern Datum similar_escape(PG_FUNCTION_ARGS); /* regproc.c */ extern Datum regprocin(PG_FUNCTION_ARGS); diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 576fafb7729..a73ca1aa84b 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -142,15 +142,15 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; t (1 row) --- T581 regular expression substring -SELECT SUBSTRING('abcdefg' FROM '(b|f).*(d)' FOR '#') AS "bcd"; +-- T581 regular expression substring (with SQL99's bizarre regexp syntax) +SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; bcd ----- bcd (1 row) -- No match should return NULL -SELECT SUBSTRING('abcdefg' FROM '(1|2|3)' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; True ------ t @@ -175,8 +175,16 @@ SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; t (1 row) --- PostgreSQL extention to allow omitting the escape character -SELECT SUBSTRING('abcdefg' FROM '(c|d).e') AS "cde"; +-- PostgreSQL extension to allow omitting the escape character; +-- here the regexp is taken as Posix syntax +SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; + cde +----- + cde +(1 row) + +-- With a parenthesized subexpression, return only what matches the subexpr +SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; cde ----- cde diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index e5c15bc528f..c0a18959cd3 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -62,19 +62,24 @@ SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890"; SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; --- T581 regular expression substring -SELECT SUBSTRING('abcdefg' FROM '(b|f).*(d)' FOR '#') AS "bcd"; +-- T581 regular expression substring (with SQL99's bizarre regexp syntax) +SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; -- No match should return NULL -SELECT SUBSTRING('abcdefg' FROM '(1|2|3)' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; -- Null inputs should return NULL SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True"; SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True"; SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; --- PostgreSQL extention to allow omitting the escape character -SELECT SUBSTRING('abcdefg' FROM '(c|d).e') AS "cde"; +-- PostgreSQL extension to allow omitting the escape character; +-- here the regexp is taken as Posix syntax +SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; + +-- With a parenthesized subexpression, return only what matches the subexpr +SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; + -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4"; |