diff options
Diffstat (limited to 'doc/src/sgml')
-rw-r--r-- | doc/src/sgml/func.sgml | 8 | ||||
-rw-r--r-- | doc/src/sgml/textsearch.sgml | 200 |
2 files changed, 117 insertions, 91 deletions
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index afdda697205..368673c66e6 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -1,4 +1,4 @@ -<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.402 2007/10/21 20:04:37 tgl Exp $ --> +<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.403 2007/10/22 20:13:37 tgl Exp $ --> <chapter id="functions"> <title>Functions and Operators</title> @@ -7857,11 +7857,11 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple </thead> <tbody> <row> - <entry><literal><function>ts_debug</function>(<optional> <replaceable class="PARAMETER">config</replaceable> <type>regconfig</>, </optional> <replaceable class="PARAMETER">document</replaceable> <type>text</>)</literal></entry> - <entry><type>setof ts_debug</type></entry> + <entry><literal><function>ts_debug</function>(<optional> <replaceable class="PARAMETER">config</replaceable> <type>regconfig</>, </optional> <replaceable class="PARAMETER">document</replaceable> <type>text</>, OUT <replaceable class="PARAMETER">alias</> <type>text</>, OUT <replaceable class="PARAMETER">description</> <type>text</>, OUT <replaceable class="PARAMETER">token</> <type>text</>, OUT <replaceable class="PARAMETER">dictionaries</> <type>regdictionary[]</>, OUT <replaceable class="PARAMETER">dictionary</> <type>regdictionary</>, OUT <replaceable class="PARAMETER">lexemes</> <type>text[]</>)</literal></entry> + <entry><type>setof record</type></entry> <entry>test a configuration</entry> <entry><literal>ts_debug('english', 'The Brightest supernovaes')</literal></entry> - <entry><literal>(lword,"Latin word",The,{english_stem},"english_stem: {}") ...</literal></entry> + <entry><literal>(lword,"Latin word",The,{english_stem},english_stem,{}) ...</literal></entry> </row> <row> <entry><literal><function>ts_lexize</function>(<replaceable class="PARAMETER">dict</replaceable> <type>regdictionary</>, <replaceable class="PARAMETER">token</replaceable> <type>text</>)</literal></entry> diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 03625b41a5b..81b54d8e174 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ -<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.22 2007/10/22 03:37:04 tgl Exp $ --> +<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.23 2007/10/22 20:13:37 tgl Exp $ --> <chapter id="textsearch"> <title id="textsearch-title">Full Text Search</title> @@ -1699,18 +1699,18 @@ ON messages FOR EACH ROW EXECUTE PROCEDURE messages_trigger(); <itemizedlist spacing="compact" mark="bullet"> <listitem> <para> - <structname>word</> <type>text</> — the value of a lexeme + <replaceable>word</> <type>text</> — the value of a lexeme </para> </listitem> <listitem> <para> - <structname>ndoc</> <type>integer</> — number of documents + <replaceable>ndoc</> <type>integer</> — number of documents (<type>tsvector</>s) the word occurred in </para> </listitem> <listitem> <para> - <structname>nentry</> <type>integer</> — total number of + <replaceable>nentry</> <type>integer</> — total number of occurrences of the word </para> </listitem> @@ -1901,8 +1901,8 @@ LIMIT 10; as the entire word and as each component: <programlisting> -SELECT "Alias", "Description", "Token" FROM ts_debug('foo-bar-beta1'); - Alias | Description | Token +SELECT alias, description, token FROM ts_debug('foo-bar-beta1'); + alias | description | token -------------+-------------------------------+--------------- hword | Hyphenated word | foo-bar-beta1 lpart_hword | Latin part of hyphenated word | foo @@ -1917,8 +1917,8 @@ SELECT "Alias", "Description", "Token" FROM ts_debug('foo-bar-beta1'); instructive example: <programlisting> -SELECT "Alias", "Description", "Token" FROM ts_debug('http://foo.com/stuff/index.html'); - Alias | Description | Token +SELECT alias, description, token FROM ts_debug('http://foo.com/stuff/index.html'); + alias | description | token ----------+---------------+-------------------------- protocol | Protocol head | http:// url | URL | foo.com/stuff/index.html @@ -2186,25 +2186,23 @@ SELECT ts_lexize('public.simple_dict','The'); synonym dictionary and put it before the <literal>english_stem</> dictionary: <programlisting> -SELECT * FROM ts_debug('english','Paris'); - Alias | Description | Token | Dictionaries | Lexized token --------+-------------+-------+----------------+---------------------- - lword | Latin word | Paris | {english_stem} | english_stem: {pari} -(1 row) +SELECT * FROM ts_debug('english', 'Paris'); + alias | description | token | dictionaries | dictionary | lexemes +-------+-------------+-------+----------------+--------------+--------- + lword | Latin word | Paris | {english_stem} | english_stem | {pari} -CREATE TEXT SEARCH DICTIONARY synonym ( +CREATE TEXT SEARCH DICTIONARY my_synonym ( TEMPLATE = synonym, SYNONYMS = my_synonyms ); ALTER TEXT SEARCH CONFIGURATION english - ALTER MAPPING FOR lword WITH synonym, english_stem; + ALTER MAPPING FOR lword WITH my_synonym, english_stem; -SELECT * FROM ts_debug('english','Paris'); - Alias | Description | Token | Dictionaries | Lexized token --------+-------------+-------+------------------------+------------------ - lword | Latin word | Paris | {synonym,english_stem} | synonym: {paris} -(1 row) +SELECT * FROM ts_debug('english', 'Paris'); + alias | description | token | dictionaries | dictionary | lexemes +-------+-------------+-------+---------------------------+------------+--------- + lword | Latin word | Paris | {my_synonym,english_stem} | my_synonym | {paris} </programlisting> </para> @@ -2711,7 +2709,14 @@ SHOW default_text_search_config; </indexterm> <synopsis> - ts_debug(<optional> <replaceable class="PARAMETER">config</replaceable> <type>regconfig</>, </optional> <replaceable class="PARAMETER">document</replaceable> <type>text</>) returns <type>setof ts_debug</> + ts_debug(<optional> <replaceable class="PARAMETER">config</replaceable> <type>regconfig</>, </optional> <replaceable class="PARAMETER">document</replaceable> <type>text</>, + OUT <replaceable class="PARAMETER">alias</> <type>text</>, + OUT <replaceable class="PARAMETER">description</> <type>text</>, + OUT <replaceable class="PARAMETER">token</> <type>text</>, + OUT <replaceable class="PARAMETER">dictionaries</> <type>regdictionary[]</>, + OUT <replaceable class="PARAMETER">dictionary</> <type>regdictionary</>, + OUT <replaceable class="PARAMETER">lexemes</> <type>text[]</>) + returns setof record </synopsis> <para> @@ -2725,23 +2730,47 @@ SHOW default_text_search_config; </para> <para> - <function>ts_debug</>'s result row type is defined as: + <function>ts_debug</> returns one row for each token identified in the text + by the parser. The columns returned are -<programlisting> -CREATE TYPE ts_debug AS ( - "Alias" text, - "Description" text, - "Token" text, - "Dictionaries" regdictionary[], - "Lexized token" text -); -</programlisting> - - One row is produced for each token identified by the parser. - The first three columns describe the token, and the fourth lists - the dictionaries selected by the configuration for that token's type. - The last column shows the result of dictionary processing: which - dictionary (if any) recognized the token, and what it produced. + <itemizedlist spacing="compact" mark="bullet"> + <listitem> + <para> + <replaceable>alias</> <type>text</> — short name of the token type + </para> + </listitem> + <listitem> + <para> + <replaceable>description</> <type>text</> — description of the + token type + </para> + </listitem> + <listitem> + <para> + <replaceable>token</> <type>text</> — text of the token + </para> + </listitem> + <listitem> + <para> + <replaceable>dictionaries</> <type>regdictionary[]</> — the + dictionaries selected by the configuration for this token type + </para> + </listitem> + <listitem> + <para> + <replaceable>dictionary</> <type>regdictionary</> — the dictionary + that recognized the token, or <literal>NULL</> if none did + </para> + </listitem> + <listitem> + <para> + <replaceable>lexemes</> <type>text[]</> — the lexeme(s) produced + by the dictionary that recognized the token, or <literal>NULL</> if + none did; an empty array (<literal>{}</>) means it was recognized as a + stop word + </para> + </listitem> + </itemizedlist> </para> <para> @@ -2749,33 +2778,32 @@ CREATE TYPE ts_debug AS ( <programlisting> SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats'); - Alias | Description | Token | Dictionaries | Lexized token --------+---------------+-------+--------------+---------------- - lword | Latin word | a | {english} | english: {} - blank | Space symbols | | | - lword | Latin word | fat | {english} | english: {fat} - blank | Space symbols | | | - lword | Latin word | cat | {english} | english: {cat} - blank | Space symbols | | | - lword | Latin word | sat | {english} | english: {sat} - blank | Space symbols | | | - lword | Latin word | on | {english} | english: {} - blank | Space symbols | | | - lword | Latin word | a | {english} | english: {} - blank | Space symbols | | | - lword | Latin word | mat | {english} | english: {mat} - blank | Space symbols | | | - blank | Space symbols | - | | - lword | Latin word | it | {english} | english: {} - blank | Space symbols | | | - lword | Latin word | ate | {english} | english: {ate} - blank | Space symbols | | | - lword | Latin word | a | {english} | english: {} - blank | Space symbols | | | - lword | Latin word | fat | {english} | english: {fat} - blank | Space symbols | | | - lword | Latin word | rats | {english} | english: {rat} - (24 rows) + alias | description | token | dictionaries | dictionary | lexemes +-------+---------------+-------+----------------+--------------+--------- + lword | Latin word | a | {english_stem} | english_stem | {} + blank | Space symbols | | {} | | + lword | Latin word | fat | {english_stem} | english_stem | {fat} + blank | Space symbols | | {} | | + lword | Latin word | cat | {english_stem} | english_stem | {cat} + blank | Space symbols | | {} | | + lword | Latin word | sat | {english_stem} | english_stem | {sat} + blank | Space symbols | | {} | | + lword | Latin word | on | {english_stem} | english_stem | {} + blank | Space symbols | | {} | | + lword | Latin word | a | {english_stem} | english_stem | {} + blank | Space symbols | | {} | | + lword | Latin word | mat | {english_stem} | english_stem | {mat} + blank | Space symbols | | {} | | + blank | Space symbols | - | {} | | + lword | Latin word | it | {english_stem} | english_stem | {} + blank | Space symbols | | {} | | + lword | Latin word | ate | {english_stem} | english_stem | {ate} + blank | Space symbols | | {} | | + lword | Latin word | a | {english_stem} | english_stem | {} + blank | Space symbols | | {} | | + lword | Latin word | fat | {english_stem} | english_stem | {fat} + blank | Space symbols | | {} | | + lword | Latin word | rats | {english_stem} | english_stem | {rat} </programlisting> </para> @@ -2801,34 +2829,33 @@ ALTER TEXT SEARCH CONFIGURATION public.english <programlisting> SELECT * FROM ts_debug('public.english','The Brightest supernovaes'); - Alias | Description | Token | Dictionaries | Lexized token --------+---------------+-------------+-------------------------------------------------+------------------------------------- - lword | Latin word | The | {public.english_ispell,pg_catalog.english_stem} | public.english_ispell: {} - blank | Space symbols | | | - lword | Latin word | Brightest | {public.english_ispell,pg_catalog.english_stem} | public.english_ispell: {bright} - blank | Space symbols | | | - lword | Latin word | supernovaes | {public.english_ispell,pg_catalog.english_stem} | pg_catalog.english_stem: {supernova} -(5 rows) + alias | description | token | dictionaries | dictionary | lexemes +-------+---------------+-------------+-------------------------------+----------------+------------- + lword | Latin word | The | {english_ispell,english_stem} | english_ispell | {} + blank | Space symbols | | {} | | + lword | Latin word | Brightest | {english_ispell,english_stem} | english_ispell | {bright} + blank | Space symbols | | {} | | + lword | Latin word | supernovaes | {english_ispell,english_stem} | english_stem | {supernova} </programlisting> <para> In this example, the word <literal>Brightest</> was recognized by the parser as a <literal>Latin word</literal> (alias <literal>lword</literal>). For this token type the dictionary list is - <literal>public.english_ispell</> and - <literal>pg_catalog.english_stem</literal>. The word was recognized by - <literal>public.english_ispell</literal>, which reduced it to the noun + <literal>english_ispell</> and + <literal>english_stem</literal>. The word was recognized by + <literal>english_ispell</literal>, which reduced it to the noun <literal>bright</literal>. The word <literal>supernovaes</literal> is - unknown to the <literal>public.english_ispell</literal> dictionary so it + unknown to the <literal>english_ispell</literal> dictionary so it was passed to the next dictionary, and, fortunately, was recognized (in - fact, <literal>public.english_stem</literal> is a Snowball dictionary which + fact, <literal>english_stem</literal> is a Snowball dictionary which recognizes everything; that is why it was placed at the end of the dictionary list). </para> <para> The word <literal>The</literal> was recognized by the - <literal>public.english_ispell</literal> dictionary as a stop word (<xref + <literal>english_ispell</literal> dictionary as a stop word (<xref linkend="textsearch-stopwords">) and will not be indexed. The spaces are discarded too, since the configuration provides no dictionaries at all for them. @@ -2839,16 +2866,15 @@ SELECT * FROM ts_debug('public.english','The Brightest supernovaes'); you want to see: <programlisting> -SELECT "Alias", "Token", "Lexized token" +SELECT alias, token, dictionary, lexemes FROM ts_debug('public.english','The Brightest supernovaes'); - Alias | Token | Lexized token --------+-------------+-------------------------------------- - lword | The | public.english_ispell: {} - blank | | - lword | Brightest | public.english_ispell: {bright} - blank | | - lword | supernovaes | pg_catalog.english_stem: {supernova} -(5 rows) + alias | token | dictionary | lexemes +-------+-------------+----------------+------------- + lword | The | english_ispell | {} + blank | | | + lword | Brightest | english_ispell | {bright} + blank | | | + lword | supernovaes | english_stem | {supernova} </programlisting> </para> |