aboutsummaryrefslogtreecommitdiff
path: root/doc/src
diff options
context:
space:
mode:
authorSimon Riggs <simon@2ndQuadrant.com>2015-05-15 14:37:10 -0400
committerSimon Riggs <simon@2ndQuadrant.com>2015-05-15 14:37:10 -0400
commitf6d208d6e51810c73f0e02c477984a6b44627f11 (patch)
tree99d540d0b7bda73ff60479f15444f554403d4679 /doc/src
parent11a83bbedd73800db70f6f2af5a8eb10d15d39d7 (diff)
downloadpostgresql-f6d208d6e51810c73f0e02c477984a6b44627f11.tar.gz
postgresql-f6d208d6e51810c73f0e02c477984a6b44627f11.zip
TABLESAMPLE, SQL Standard and extensible
Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs
Diffstat (limited to 'doc/src')
-rw-r--r--doc/src/sgml/catalogs.sgml120
-rw-r--r--doc/src/sgml/ref/select.sgml61
2 files changed, 180 insertions, 1 deletions
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 6c51974961b..5b364876093 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -279,6 +279,11 @@
</row>
<row>
+ <entry><link linkend="catalog-pg-tablesample-method"><structname>pg_tablesample_method</structname></link></entry>
+ <entry>table sampling methods</entry>
+ </row>
+
+ <row>
<entry><link linkend="catalog-pg-tablespace"><structname>pg_tablespace</structname></link></entry>
<entry>tablespaces within this database cluster</entry>
</row>
@@ -6127,6 +6132,121 @@
</sect1>
+ <sect1 id="catalog-pg-tablesample-method">
+ <title><structname>pg_tabesample_method</structname></title>
+
+ <indexterm zone="catalog-pg-tablesample-method">
+ <primary>pg_am</primary>
+ </indexterm>
+
+ <para>
+ The catalog <structname>pg_tablesample_method</structname> stores
+ information about table sampling methods which can be used in
+ <command>TABLESAMPLE</command> clause of a <command>SELECT</command>
+ statement.
+ </para>
+
+ <table>
+ <title><structname>pg_tablesample_method</> Columns</title>
+
+ <tgroup cols="4">
+ <thead>
+ <row>
+ <entry>Name</entry>
+ <entry>Type</entry>
+ <entry>References</entry>
+ <entry>Description</entry>
+ </row>
+ </thead>
+ <tbody>
+
+ <row>
+ <entry><structfield>oid</structfield></entry>
+ <entry><type>oid</type></entry>
+ <entry></entry>
+ <entry>Row identifier (hidden attribute; must be explicitly selected)</entry>
+ </row>
+
+ <row>
+ <entry><structfield>tsmname</structfield></entry>
+ <entry><type>name</type></entry>
+ <entry></entry>
+ <entry>Name of the sampling method</entry>
+ </row>
+
+ <row>
+ <entry><structfield>tsmseqscan</structfield></entry>
+ <entry><type>bool</type></entry>
+ <entry></entry>
+ <entry>If true, the sampling method scans the whole table sequentially.
+ </entry>
+ </row>
+
+ <row>
+ <entry><structfield>tsmpagemode</structfield></entry>
+ <entry><type>bool</type></entry>
+ <entry></entry>
+ <entry>If true, the sampling method always reads the pages completely.
+ </entry>
+ </row>
+
+ <row>
+ <entry><structfield>tsminit</structfield></entry>
+ <entry><type>regproc</type></entry>
+ <entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
+ <entry><quote>Initialize the sampling scan</quote> function</entry>
+ </row>
+
+ <row>
+ <entry><structfield>tsmnextblock</structfield></entry>
+ <entry><type>regproc</type></entry>
+ <entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
+ <entry><quote>Get next block number</quote> function</entry>
+ </row>
+
+ <row>
+ <entry><structfield>tsmnexttuple</structfield></entry>
+ <entry><type>regproc</type></entry>
+ <entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
+ <entry><quote>Get next tuple offset</quote> function</entry>
+ </row>
+
+ <row>
+ <entry><structfield>tsmexaminetuple</structfield></entry>
+ <entry><type>regproc</type></entry>
+ <entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
+ <entry>Function which examines the tuple contents and decides if to
+ return it, or zero if none</entry>
+ </row>
+
+ <row>
+ <entry><structfield>tsmend</structfield></entry>
+ <entry><type>regproc</type></entry>
+ <entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
+ <entry><quote>End the sampling scan</quote> function</entry>
+ </row>
+
+ <row>
+ <entry><structfield>tsmreset</structfield></entry>
+ <entry><type>regproc</type></entry>
+ <entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
+ <entry><quote>Restart the state of sampling scan</quote> function</entry>
+ </row>
+
+ <row>
+ <entry><structfield>tsmcost</structfield></entry>
+ <entry><type>regproc</type></entry>
+ <entry><literal><link linkend="catalog-pg-proc"><structname>pg_proc</structname></link>.oid</literal></entry>
+ <entry>Costing function</entry>
+ </row>
+
+ </tbody>
+ </tgroup>
+ </table>
+
+ </sect1>
+
+
<sect1 id="catalog-pg-tablespace">
<title><structname>pg_tablespace</structname></title>
diff --git a/doc/src/sgml/ref/select.sgml b/doc/src/sgml/ref/select.sgml
index 2295f63c130..42e04660a19 100644
--- a/doc/src/sgml/ref/select.sgml
+++ b/doc/src/sgml/ref/select.sgml
@@ -49,7 +49,7 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="parameter">expression</replac
<phrase>where <replaceable class="parameter">from_item</replaceable> can be one of:</phrase>
- [ ONLY ] <replaceable class="parameter">table_name</replaceable> [ * ] [ [ AS ] <replaceable class="parameter">alias</replaceable> [ ( <replaceable class="parameter">column_alias</replaceable> [, ...] ) ] ]
+ [ ONLY ] <replaceable class="parameter">table_name</replaceable> [ * ] [ [ AS ] <replaceable class="parameter">alias</replaceable> [ ( <replaceable class="parameter">column_alias</replaceable> [, ...] ) ] ] [ TABLESAMPLE <replaceable class="parameter">sampling_method</replaceable> ( <replaceable class="parameter">argument</replaceable> [, ...] ) [ REPEATABLE ( <replaceable class="parameter">seed</replaceable> ) ] ]
[ LATERAL ] ( <replaceable class="parameter">select</replaceable> ) [ AS ] <replaceable class="parameter">alias</replaceable> [ ( <replaceable class="parameter">column_alias</replaceable> [, ...] ) ]
<replaceable class="parameter">with_query_name</replaceable> [ [ AS ] <replaceable class="parameter">alias</replaceable> [ ( <replaceable class="parameter">column_alias</replaceable> [, ...] ) ] ]
[ LATERAL ] <replaceable class="parameter">function_name</replaceable> ( [ <replaceable class="parameter">argument</replaceable> [, ...] ] )
@@ -317,6 +317,50 @@ TABLE [ ONLY ] <replaceable class="parameter">table_name</replaceable> [ * ]
</varlistentry>
<varlistentry>
+ <term>TABLESAMPLE <replaceable class="parameter">sampling_method</replaceable> ( <replaceable class="parameter">argument</replaceable> [, ...] ) [ REPEATABLE ( <replaceable class="parameter">seed</replaceable> ) ]</term>
+ <listitem>
+ <para>
+ Table sample clause after
+ <replaceable class="parameter">table_name</replaceable> indicates that
+ a <replaceable class="parameter">sampling_method</replaceable> should
+ be used to retrieve subset of rows in the table.
+ The <replaceable class="parameter">sampling_method</replaceable> can be
+ any sampling method installed in the database. There are currently two
+ sampling methods available in the standard
+ <productname>PostgreSQL</productname> distribution:
+ <itemizedlist>
+ <listitem>
+ <para><literal>SYSTEM</literal></para>
+ </listitem>
+ <listitem>
+ <para><literal>BERNOULLI</literal></para>
+ </listitem>
+ </itemizedlist>
+ Both of these sampling methods currently accept only single argument
+ which is the percent (floating point from 0 to 100) of the rows to
+ be returned.
+ The <literal>SYSTEM</literal> sampling method does block level
+ sampling with each block having the same chance of being selected and
+ returns all rows from each selected block.
+ The <literal>BERNOULLI</literal> scans whole table and returns
+ individual rows with equal probability. Additional sampling methods
+ may be installed in the database via extensions.
+ </para>
+ <para>
+ The optional parameter <literal>REPEATABLE</literal> uses the seed
+ parameter, which can be a number or expression producing a number, as
+ a random seed for sampling. Note that subsequent commands may return
+ different results even if same <literal>REPEATABLE</literal> clause was
+ specified. This happens because <acronym>DML</acronym> statements and
+ maintenance operations such as <command>VACUUM</> may affect physical
+ distribution of data. The <function>setseed()</> function will not
+ affect the sampling result when the <literal>REPEATABLE</literal>
+ parameter is used.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><replaceable class="parameter">alias</replaceable></term>
<listitem>
<para>
@@ -1927,5 +1971,20 @@ SELECT distributors.* WHERE distributors.name = 'Westward';
<literal>ROWS FROM( ... )</> is an extension of the SQL standard.
</para>
</refsect2>
+
+ <refsect2>
+ <title><literal>TABLESAMPLE</literal> clause</title>
+
+ <para>
+ The <literal>TABLESAMPLE</> clause is currently accepted only on physical
+ relations and materialized views.
+ </para>
+
+ <para>
+ Additional modules allow you to install custom sampling methods and use
+ them instead of the SQL standard methods.
+ </para>
+ </refsect2>
+
</refsect1>
</refentry>