aboutsummaryrefslogtreecommitdiff
path: root/doc/src
diff options
context:
space:
mode:
Diffstat (limited to 'doc/src')
-rw-r--r--doc/src/sgml/catalogs.sgml13
-rw-r--r--doc/src/sgml/func.sgml82
-rw-r--r--doc/src/sgml/perform.sgml66
-rw-r--r--doc/src/sgml/planstats.sgml116
-rw-r--r--doc/src/sgml/ref/create_statistics.sgml35
5 files changed, 307 insertions, 5 deletions
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 45ed077654e..026c422cd22 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -6562,7 +6562,8 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
An array containing codes for the enabled statistic kinds;
valid values are:
<literal>d</literal> for n-distinct statistics,
- <literal>f</literal> for functional dependency statistics
+ <literal>f</literal> for functional dependency statistics, and
+ <literal>m</literal> for most common values (MCV) list statistics
</entry>
</row>
@@ -6585,6 +6586,16 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
</entry>
</row>
+ <row>
+ <entry><structfield>stxmcv</structfield></entry>
+ <entry><type>pg_mcv_list</type></entry>
+ <entry></entry>
+ <entry>
+ MCV (most-common values) list statistics, serialized as
+ <structname>pg_mcv_list</structname> type.
+ </entry>
+ </row>
+
</tbody>
</tgroup>
</table>
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 1a014732919..d24901126f0 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -22174,4 +22174,86 @@ CREATE EVENT TRIGGER test_table_rewrite_oid
</sect2>
</sect1>
+ <sect1 id="functions-statistics">
+ <title>Statistics Information Functions</title>
+
+ <indexterm zone="functions-statistics">
+ <primary>function</primary>
+ <secondary>statistics</secondary>
+ </indexterm>
+
+ <para>
+ <productname>PostgreSQL</productname> provides a function to inspect complex
+ statistics defined using the <command>CREATE STATISTICS</command> command.
+ </para>
+
+ <sect2 id="functions-statistics-mcv">
+ <title>Inspecting MCV lists</title>
+
+ <indexterm>
+ <primary>pg_mcv_list_items</primary>
+ <secondary>pg_mcv_list</secondary>
+ </indexterm>
+
+ <para>
+ <function>pg_mcv_list_items</function> returns a list of all items
+ stored in a multi-column <acronym>MCV</acronym> list, and returns the
+ following columns:
+
+ <informaltable>
+ <tgroup cols="3">
+ <thead>
+ <row>
+ <entry>Name</entry>
+ <entry>Type</entry>
+ <entry>Description</entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry><literal>index</literal></entry>
+ <entry><type>int</type></entry>
+ <entry>index of the item in the <acronym>MCV</acronym> list</entry>
+ </row>
+ <row>
+ <entry><literal>values</literal></entry>
+ <entry><type>text[]</type></entry>
+ <entry>values stored in the MCV item</entry>
+ </row>
+ <row>
+ <entry><literal>nulls</literal></entry>
+ <entry><type>boolean[]</type></entry>
+ <entry>flags identifying <literal>NULL</literal> values</entry>
+ </row>
+ <row>
+ <entry><literal>frequency</literal></entry>
+ <entry><type>double precision</type></entry>
+ <entry>frequency of this <acronym>MCV</acronym> item</entry>
+ </row>
+ <row>
+ <entry><literal>base_frequency</literal></entry>
+ <entry><type>double precision</type></entry>
+ <entry>base frequency of this <acronym>MCV</acronym> item</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </informaltable>
+ </para>
+
+ <para>
+ The <function>pg_mcv_list_items</function> function can be used like this:
+
+<programlisting>
+SELECT m.* FROM pg_statistic_ext,
+ pg_mcv_list_items(stxmcv) m WHERE stxname = 'stts';
+</programlisting>
+
+ Values of the <type>pg_mcv_list</type> can be obtained only from the
+ <literal>pg_statistic_ext.stxmcv</literal> column.
+ </para>
+ </sect2>
+
+ </sect1>
+
</chapter>
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 783d708073d..a84be851593 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1285,6 +1285,72 @@ nd | {"1, 2": 33178, "1, 5": 33178, "2, 5": 27435, "1, 2, 5": 33178}
plans. Otherwise, the <command>ANALYZE</command> cycles are just wasted.
</para>
</sect3>
+
+ <sect3>
+ <title>Multivariate MCV lists</title>
+
+ <para>
+ Another type of statistics stored for each column are most-common value
+ lists. This allows very accurate estimates for individual columns, but
+ may result in significant misestimates for queries with conditions on
+ multiple columns.
+ </para>
+
+ <para>
+ To improve such estimates, <command>ANALYZE</command> can collect MCV
+ lists on combinations of columns. Similarly to functional dependencies
+ and n-distinct coefficients, it's impractical to do this for every
+ possible column grouping. Even more so in this case, as the MCV list
+ (unlike functional dependencies and n-distinct coefficients) does store
+ the common column values. So data is collected only for those groups
+ of columns appearing together in a statistics object defined with the
+ <literal>mcv</literal> option.
+ </para>
+
+ <para>
+ Continuing the previous example, the MCV list for a table of ZIP codes
+ might look like the following (unlike for simpler types of statistics,
+ a function is required for inspection of MCV contents):
+
+<programlisting>
+CREATE STATISTICS stts3 (mcv) ON state, city FROM zipcodes;
+
+ANALYZE zipcodes;
+
+SELECT m.* FROM pg_statistic_ext,
+ pg_mcv_list_items(stxmcv) m WHERE stxname = 'stts3';
+
+ index | values | nulls | frequency | base_frequency
+-------+------------------------+-------+-----------+----------------
+ 0 | {Washington, DC} | {f,f} | 0.003467 | 2.7e-05
+ 1 | {Apo, AE} | {f,f} | 0.003067 | 1.9e-05
+ 2 | {Houston, TX} | {f,f} | 0.002167 | 0.000133
+ 3 | {El Paso, TX} | {f,f} | 0.002 | 0.000113
+ 4 | {New York, NY} | {f,f} | 0.001967 | 0.000114
+ 5 | {Atlanta, GA} | {f,f} | 0.001633 | 3.3e-05
+ 6 | {Sacramento, CA} | {f,f} | 0.001433 | 7.8e-05
+ 7 | {Miami, FL} | {f,f} | 0.0014 | 6e-05
+ 8 | {Dallas, TX} | {f,f} | 0.001367 | 8.8e-05
+ 9 | {Chicago, IL} | {f,f} | 0.001333 | 5.1e-05
+ ...
+(99 rows)
+</programlisting>
+ This indicates that the most common combination of city and state is
+ Washington in DC, with actual frequency (in the sample) about 0.35%.
+ The base frequency of the combination (as computed from the simple
+ per-column frequencies) is only 0.0027%, resulting in two orders of
+ magnitude under-estimates.
+ </para>
+
+ <para>
+ It's advisable to create <acronym>MCV</acronym> statistics objects only
+ on combinations of columns that are actually used in conditions together,
+ and for which misestimation of the number of groups is resulting in bad
+ plans. Otherwise, the <command>ANALYZE</command> and planning cycles
+ are just wasted.
+ </para>
+ </sect3>
+
</sect2>
</sect1>
diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml
index ef643ad0646..4b1d3f4952e 100644
--- a/doc/src/sgml/planstats.sgml
+++ b/doc/src/sgml/planstats.sgml
@@ -455,7 +455,7 @@ rows = (outer_cardinality * inner_cardinality) * selectivity
<secondary>multivariate</secondary>
</indexterm>
- <sect2>
+ <sect2 id="functional-dependencies">
<title>Functional Dependencies</title>
<para>
@@ -540,7 +540,7 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
</para>
</sect2>
- <sect2>
+ <sect2 id="multivariate-ndistinct-counts">
<title>Multivariate N-Distinct Counts</title>
<para>
@@ -585,6 +585,118 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a, b;
</para>
</sect2>
+
+ <sect2 id="mcv-lists">
+ <title>MCV lists</title>
+
+ <para>
+ As explained in <xref linkend="functional-dependencies"/>, functional
+ dependencies are very cheap and efficient type of statistics, but their
+ main limitation is their global nature (only tracking dependencies at
+ the column level, not between individual column values).
+ </para>
+
+ <para>
+ This section introduces multivariate variant of <acronym>MCV</acronym>
+ (most-common values) lists, a straightforward extension of the per-column
+ statistics described in <xref linkend="row-estimation-examples"/>. These
+ statistics address the limitation by storing individual values, but it is
+ naturally more expensive, both in terms of building the statistics in
+ <command>ANALYZE</command>, storage and planning time.
+ </para>
+
+ <para>
+ Let's look at the query from <xref linkend="functional-dependencies"/>
+ again, but this time with a <acronym>MCV</acronym> list created on the
+ same set of columns (be sure to drop the functional dependencies, to
+ make sure the planner uses the newly created statistics).
+
+<programlisting>
+DROP STATISTICS stts;
+CREATE STATISTICS stts2 (mcv) ON a, b FROM t;
+ANALYZE t;
+EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
+ QUERY PLAN
+-------------------------------------------------------------------------------
+ Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual rows=100 loops=1)
+ Filter: ((a = 1) AND (b = 1))
+ Rows Removed by Filter: 9900
+</programlisting>
+
+ The estimate is as accurate as with the functional dependencies, mostly
+ thanks to the table being fairly small and having a simple distribution
+ with a low number of distinct values. Before looking at the second query,
+ which was not handled by functional dependencies particularly well,
+ let's inspect the <acronym>MCV</acronym> list a bit.
+ </para>
+
+ <para>
+ Inspecting the <acronym>MCV</acronym> list is possible using
+ <function>pg_mcv_list_items</function> set-returning function.
+
+<programlisting>
+SELECT m.* FROM pg_statistic_ext,
+ pg_mcv_list_items(stxmcv) m WHERE stxname = 'stts2';
+ index | values | nulls | frequency | base_frequency
+-------+----------+-------+-----------+----------------
+ 0 | {0, 0} | {f,f} | 0.01 | 0.0001
+ 1 | {1, 1} | {f,f} | 0.01 | 0.0001
+ ...
+ 49 | {49, 49} | {f,f} | 0.01 | 0.0001
+ 50 | {50, 50} | {f,f} | 0.01 | 0.0001
+ ...
+ 97 | {97, 97} | {f,f} | 0.01 | 0.0001
+ 98 | {98, 98} | {f,f} | 0.01 | 0.0001
+ 99 | {99, 99} | {f,f} | 0.01 | 0.0001
+(100 rows)
+</programlisting>
+
+ This confirms there are 100 distinct combinations in the two columns, and
+ all of them are about equally likely (1% frequency for each one). The
+ base frequency is the frequency computed from per-column statistics, as if
+ there were no multi-column statistics. Had there been any null values in
+ either of the columns, this would be identified in the
+ <structfield>nulls</structfield> column.
+ </para>
+
+ <para>
+ When estimating the selectivity, the planner applies all the conditions
+ on items in the <acronym>MCV</acronym> list, and then sums the frequencies
+ of the matching ones. See <function>mcv_clauselist_selectivity</function>
+ in <filename>src/backend/statistics/mcv.c</filename> for details.
+ </para>
+
+ <para>
+ Compared to functional dependencies, <acronym>MCV</acronym> lists have two
+ major advantages. Firstly, the list stores actual values, making it possible
+ to decide which combinations are compatible.
+
+<programlisting>
+EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 10;
+ QUERY PLAN
+---------------------------------------------------------------------------
+ Seq Scan on t (cost=0.00..195.00 rows=1 width=8) (actual rows=0 loops=1)
+ Filter: ((a = 1) AND (b = 10))
+ Rows Removed by Filter: 10000
+</programlisting>
+
+ Secondly, <acronym>MCV</acronym> lists handle a wider range of clause types,
+ not just equality clauses like functional dependencies. See for example the
+ example range query, presented earlier:
+
+<programlisting>
+EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a &lt;= 49 AND b &gt; 49;
+ QUERY PLAN
+---------------------------------------------------------------------------
+ Seq Scan on t (cost=0.00..195.00 rows=1 width=8) (actual rows=0 loops=1)
+ Filter: ((a &lt;= 49) AND (b &gt; 49))
+ Rows Removed by Filter: 10000
+</programlisting>
+
+ </para>
+
+ </sect2>
+
</sect1>
<sect1 id="planner-stats-security">
diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml
index 539f5bded54..ae1d8024a4e 100644
--- a/doc/src/sgml/ref/create_statistics.sgml
+++ b/doc/src/sgml/ref/create_statistics.sgml
@@ -81,9 +81,10 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
<para>
A statistics kind to be computed in this statistics object.
Currently supported kinds are
- <literal>ndistinct</literal>, which enables n-distinct statistics, and
+ <literal>ndistinct</literal>, which enables n-distinct statistics,
<literal>dependencies</literal>, which enables functional
- dependency statistics.
+ dependency statistics, and <literal>mcv</literal> which enables
+ most-common values lists.
If this clause is omitted, all supported statistics kinds are
included in the statistics object.
For more information, see <xref linkend="planner-stats-extended"/>
@@ -164,6 +165,36 @@ EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
conditions are redundant and does not underestimate the row count.
</para>
+ <para>
+ Create table <structname>t2</structname> with two perfectly correlated columns
+ (containing identical data), and a MCV list on those columns:
+
+<programlisting>
+CREATE TABLE t2 (
+ a int,
+ b int
+);
+
+INSERT INTO t2 SELECT mod(i,100), mod(i,100)
+ FROM generate_series(1,1000000) s(i);
+
+CREATE STATISTICS s2 (mcv) ON (a, b) FROM t2;
+
+ANALYZE t2;
+
+-- valid combination (found in MCV)
+EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 1);
+
+-- invalid combination (not found in MCV)
+EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 2);
+</programlisting>
+
+ The MCV list gives the planner more detailed information about the
+ specific values that commonly appear in the table, as well as an upper
+ bound on the selectivities of combinations of values that do not appear
+ in the table, allowing it to generate better estimates in both cases.
+ </para>
+
</refsect1>
<refsect1>