diff options
57 files changed, 6807 insertions, 24 deletions
diff --git a/contrib/pageinspect/Makefile b/contrib/pageinspect/Makefile index f10229db482..a59de8aba9e 100644 --- a/contrib/pageinspect/Makefile +++ b/contrib/pageinspect/Makefile @@ -1,10 +1,11 @@ # contrib/pageinspect/Makefile MODULE_big = pageinspect -OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o $(WIN32RES) +OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o brinfuncs.o $(WIN32RES) EXTENSION = pageinspect -DATA = pageinspect--1.2.sql pageinspect--1.0--1.1.sql \ +DATA = pageinspect--1.3.sql pageinspect--1.0--1.1.sql \ + pageinspect--1.2--1.3.sql \ pageinspect--1.1--1.2.sql pageinspect--unpackaged--1.0.sql PGFILEDESC = "pageinspect - functions to inspect contents of database pages" diff --git a/contrib/pageinspect/brinfuncs.c b/contrib/pageinspect/brinfuncs.c new file mode 100644 index 00000000000..359fc1d3ac6 --- /dev/null +++ b/contrib/pageinspect/brinfuncs.c @@ -0,0 +1,414 @@ +/* + * brinfuncs.c + * Functions to investigate BRIN indexes + * + * Copyright (c) 2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/pageinspect/brinfuncs.c + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/brin.h" +#include "access/brin_internal.h" +#include "access/brin_page.h" +#include "access/brin_revmap.h" +#include "access/brin_tuple.h" +#include "catalog/index.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "lib/stringinfo.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "miscadmin.h" + + +PG_FUNCTION_INFO_V1(brin_page_type); +PG_FUNCTION_INFO_V1(brin_page_items); +PG_FUNCTION_INFO_V1(brin_metapage_info); +PG_FUNCTION_INFO_V1(brin_revmap_data); + +typedef struct brin_column_state +{ + int nstored; + FmgrInfo outputFn[FLEXIBLE_ARRAY_MEMBER]; +} brin_column_state; + +typedef struct brin_page_state +{ + BrinDesc *bdesc; + Page page; + OffsetNumber offset; + bool unusedItem; + bool done; + AttrNumber attno; + BrinMemTuple 
*dtup; + brin_column_state *columns[FLEXIBLE_ARRAY_MEMBER]; +} brin_page_state; + + +static Page verify_brin_page(bytea *raw_page, uint16 type, + const char *strtype); + +Datum +brin_page_type(PG_FUNCTION_ARGS) +{ + bytea *raw_page = PG_GETARG_BYTEA_P(0); + Page page = VARDATA(raw_page); + BrinSpecialSpace *special; + char *type; + + special = (BrinSpecialSpace *) PageGetSpecialPointer(page); + + switch (special->type) + { + case BRIN_PAGETYPE_META: + type = "meta"; + break; + case BRIN_PAGETYPE_REVMAP: + type = "revmap"; + break; + case BRIN_PAGETYPE_REGULAR: + type = "regular"; + break; + default: + type = psprintf("unknown (%02x)", special->type); + break; + } + + PG_RETURN_TEXT_P(cstring_to_text(type)); +} + +/* + * Verify that the given bytea contains a BRIN page of the indicated page + * type, or die in the attempt. A pointer to the page is returned. + */ +static Page +verify_brin_page(bytea *raw_page, uint16 type, const char *strtype) +{ + Page page; + int raw_page_size; + BrinSpecialSpace *special; + + raw_page_size = VARSIZE(raw_page) - VARHDRSZ; + + if (raw_page_size < SizeOfPageHeaderData) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("input page too small"), + errdetail("Expected size %d, got %d", raw_page_size, BLCKSZ))); + + page = VARDATA(raw_page); + + /* verify the special space says this page is what we want */ + special = (BrinSpecialSpace *) PageGetSpecialPointer(page); + if (special->type != type) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("page is not a BRIN page of type \"%s\"", strtype), + errdetail("Expected special type %08x, got %08x.", + type, special->type))); + + return page; +} + + +/* + * Extract all item values from a BRIN index page + * + * Usage: SELECT * FROM brin_page_items(get_raw_page('idx', 1), 'idx'::regclass); + */ +Datum +brin_page_items(PG_FUNCTION_ARGS) +{ + brin_page_state *state; + FuncCallContext *fctx; + + if (!superuser()) + ereport(ERROR, + 
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use raw page functions")))); + + if (SRF_IS_FIRSTCALL()) + { + bytea *raw_page = PG_GETARG_BYTEA_P(0); + Oid indexRelid = PG_GETARG_OID(1); + Page page; + TupleDesc tupdesc; + MemoryContext mctx; + Relation indexRel; + AttrNumber attno; + + /* minimally verify the page we got */ + page = verify_brin_page(raw_page, BRIN_PAGETYPE_REGULAR, "regular"); + + /* create a function context for cross-call persistence */ + fctx = SRF_FIRSTCALL_INIT(); + + /* switch to memory context appropriate for multiple function calls */ + mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + indexRel = index_open(indexRelid, AccessShareLock); + + state = palloc(offsetof(brin_page_state, columns) + + sizeof(brin_column_state) * RelationGetDescr(indexRel)->natts); + + state->bdesc = brin_build_desc(indexRel); + state->page = page; + state->offset = FirstOffsetNumber; + state->unusedItem = false; + state->done = false; + state->dtup = NULL; + + /* + * Initialize output functions for all indexed datatypes; simplifies + * calling them later. 
+ */ + for (attno = 1; attno <= state->bdesc->bd_tupdesc->natts; attno++) + { + Oid output; + bool isVarlena; + BrinOpcInfo *opcinfo; + int i; + brin_column_state *column; + + opcinfo = state->bdesc->bd_info[attno - 1]; + column = palloc(offsetof(brin_column_state, outputFn) + + sizeof(FmgrInfo) * opcinfo->oi_nstored); + + column->nstored = opcinfo->oi_nstored; + for (i = 0; i < opcinfo->oi_nstored; i++) + { + getTypeOutputInfo(opcinfo->oi_typids[i], &output, &isVarlena); + fmgr_info(output, &column->outputFn[i]); + } + + state->columns[attno - 1] = column; + } + + index_close(indexRel, AccessShareLock); + + fctx->user_fctx = state; + fctx->tuple_desc = BlessTupleDesc(tupdesc); + + MemoryContextSwitchTo(mctx); + } + + fctx = SRF_PERCALL_SETUP(); + state = fctx->user_fctx; + + if (!state->done) + { + HeapTuple result; + Datum values[7]; + bool nulls[7]; + + /* + * This loop is called once for every attribute of every tuple in the + * page. At the start of a tuple, we get a NULL dtup; that's our + * signal for obtaining and decoding the next one. If that's not the + * case, we output the next attribute. 
+ */ + if (state->dtup == NULL) + { + BrinTuple *tup; + MemoryContext mctx; + ItemId itemId; + + /* deformed tuple must live across calls */ + mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); + + /* verify item status: if there's no data, we can't decode */ + itemId = PageGetItemId(state->page, state->offset); + if (ItemIdIsUsed(itemId)) + { + tup = (BrinTuple *) PageGetItem(state->page, + PageGetItemId(state->page, + state->offset)); + state->dtup = brin_deform_tuple(state->bdesc, tup); + state->attno = 1; + state->unusedItem = false; + } + else + state->unusedItem = true; + + MemoryContextSwitchTo(mctx); + } + else + state->attno++; + + MemSet(nulls, 0, sizeof(nulls)); + + if (state->unusedItem) + { + values[0] = UInt16GetDatum(state->offset); + nulls[1] = true; + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + nulls[5] = true; + nulls[6] = true; + } + else + { + int att = state->attno - 1; + + values[0] = UInt16GetDatum(state->offset); + values[1] = UInt32GetDatum(state->dtup->bt_blkno); + values[2] = UInt16GetDatum(state->attno); + values[3] = BoolGetDatum(state->dtup->bt_columns[att].bv_allnulls); + values[4] = BoolGetDatum(state->dtup->bt_columns[att].bv_hasnulls); + values[5] = BoolGetDatum(state->dtup->bt_placeholder); + if (!state->dtup->bt_columns[att].bv_allnulls) + { + BrinValues *bvalues = &state->dtup->bt_columns[att]; + StringInfoData s; + bool first; + int i; + + initStringInfo(&s); + appendStringInfoChar(&s, '{'); + + first = true; + for (i = 0; i < state->columns[att]->nstored; i++) + { + char *val; + + if (!first) + appendStringInfoString(&s, " .. 
"); + first = false; + val = OutputFunctionCall(&state->columns[att]->outputFn[i], + bvalues->bv_values[i]); + appendStringInfoString(&s, val); + pfree(val); + } + appendStringInfoChar(&s, '}'); + + values[6] = CStringGetTextDatum(s.data); + pfree(s.data); + } + else + { + nulls[6] = true; + } + } + + result = heap_form_tuple(fctx->tuple_desc, values, nulls); + + /* + * If the item was unused, jump straight to the next one; otherwise, + * the only cleanup needed here is to set our signal to go to the next + * tuple in the following iteration, by freeing the current one. + */ + if (state->unusedItem) + state->offset = OffsetNumberNext(state->offset); + else if (state->attno >= state->bdesc->bd_tupdesc->natts) + { + pfree(state->dtup); + state->dtup = NULL; + state->offset = OffsetNumberNext(state->offset); + } + + /* + * If we're beyond the end of the page, set flag to end the function in + * the following iteration. + */ + if (state->offset > PageGetMaxOffsetNumber(state->page)) + state->done = true; + + SRF_RETURN_NEXT(fctx, HeapTupleGetDatum(result)); + } + + brin_free_desc(state->bdesc); + + SRF_RETURN_DONE(fctx); +} + +Datum +brin_metapage_info(PG_FUNCTION_ARGS) +{ + bytea *raw_page = PG_GETARG_BYTEA_P(0); + Page page; + BrinMetaPageData *meta; + TupleDesc tupdesc; + Datum values[4]; + bool nulls[4]; + HeapTuple htup; + + page = verify_brin_page(raw_page, BRIN_PAGETYPE_META, "metapage"); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + tupdesc = BlessTupleDesc(tupdesc); + + /* Extract values from the metapage */ + meta = (BrinMetaPageData *) PageGetContents(page); + MemSet(nulls, 0, sizeof(nulls)); + values[0] = CStringGetTextDatum(psprintf("0x%08X", meta->brinMagic)); + values[1] = Int32GetDatum(meta->brinVersion); + values[2] = Int32GetDatum(meta->pagesPerRange); + values[3] = Int64GetDatum(meta->lastRevmapPage); + + htup = 
heap_form_tuple(tupdesc, values, nulls); + + PG_RETURN_DATUM(HeapTupleGetDatum(htup)); +} + +/* + * Return the TID array stored in a BRIN revmap page + */ +Datum +brin_revmap_data(PG_FUNCTION_ARGS) +{ + struct + { + ItemPointerData *tids; + int idx; + } *state; + FuncCallContext *fctx; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use raw page functions")))); + + if (SRF_IS_FIRSTCALL()) + { + bytea *raw_page = PG_GETARG_BYTEA_P(0); + MemoryContext mctx; + Page page; + + /* minimally verify the page we got */ + page = verify_brin_page(raw_page, BRIN_PAGETYPE_REVMAP, "revmap"); + + /* create a function context for cross-call persistence */ + fctx = SRF_FIRSTCALL_INIT(); + + /* switch to memory context appropriate for multiple function calls */ + mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); + + state = palloc(sizeof(*state)); + state->tids = ((RevmapContents *) PageGetContents(page))->rm_tids; + state->idx = 0; + + fctx->user_fctx = state; + + MemoryContextSwitchTo(mctx); + } + + fctx = SRF_PERCALL_SETUP(); + state = fctx->user_fctx; + + if (state->idx < REVMAP_PAGE_MAXITEMS) + SRF_RETURN_NEXT(fctx, PointerGetDatum(&state->tids[state->idx++])); + + SRF_RETURN_DONE(fctx); +} diff --git a/contrib/pageinspect/pageinspect--1.2--1.3.sql b/contrib/pageinspect/pageinspect--1.2--1.3.sql new file mode 100644 index 00000000000..9bc4dded0f4 --- /dev/null +++ b/contrib/pageinspect/pageinspect--1.2--1.3.sql @@ -0,0 +1,43 @@ +/* contrib/pageinspect/pageinspect--1.2--1.3.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.3'" to load this file. 
\quit + +-- +-- brin_page_type() +-- +CREATE FUNCTION brin_page_type(IN page bytea) +RETURNS text +AS 'MODULE_PATHNAME', 'brin_page_type' +LANGUAGE C STRICT; + +-- +-- brin_metapage_info() +-- +CREATE FUNCTION brin_metapage_info(IN page bytea, OUT magic text, + OUT version integer, OUT pagesperrange integer, OUT lastrevmappage bigint) +AS 'MODULE_PATHNAME', 'brin_metapage_info' +LANGUAGE C STRICT; + +-- +-- brin_revmap_data() +CREATE FUNCTION brin_revmap_data(IN page bytea, + OUT pages tid) +RETURNS SETOF tid +AS 'MODULE_PATHNAME', 'brin_revmap_data' +LANGUAGE C STRICT; + +-- +-- brin_page_items() +-- +CREATE FUNCTION brin_page_items(IN page bytea, IN index_oid regclass, + OUT itemoffset int, + OUT blknum int, + OUT attnum int, + OUT allnulls bool, + OUT hasnulls bool, + OUT placeholder bool, + OUT value text) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'brin_page_items' +LANGUAGE C STRICT; diff --git a/contrib/pageinspect/pageinspect--1.2.sql b/contrib/pageinspect/pageinspect--1.3.sql index 15e8e1e3811..856dcdfb592 100644 --- a/contrib/pageinspect/pageinspect--1.2.sql +++ b/contrib/pageinspect/pageinspect--1.3.sql @@ -1,4 +1,4 @@ -/* contrib/pageinspect/pageinspect--1.2.sql */ +/* contrib/pageinspect/pageinspect--1.3.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pageinspect" to load this file. 
\quit @@ -99,6 +99,45 @@ AS 'MODULE_PATHNAME', 'bt_page_items' LANGUAGE C STRICT; -- +-- brin_page_type() +-- +CREATE FUNCTION brin_page_type(IN page bytea) +RETURNS text +AS 'MODULE_PATHNAME', 'brin_page_type' +LANGUAGE C STRICT; + +-- +-- brin_metapage_info() +-- +CREATE FUNCTION brin_metapage_info(IN page bytea, OUT magic text, + OUT version integer, OUT pagesperrange integer, OUT lastrevmappage bigint) +AS 'MODULE_PATHNAME', 'brin_metapage_info' +LANGUAGE C STRICT; + +-- +-- brin_revmap_data() +CREATE FUNCTION brin_revmap_data(IN page bytea, + OUT pages tid) +RETURNS SETOF tid +AS 'MODULE_PATHNAME', 'brin_revmap_data' +LANGUAGE C STRICT; + +-- +-- brin_page_items() +-- +CREATE FUNCTION brin_page_items(IN page bytea, IN index_oid regclass, + OUT itemoffset int, + OUT blknum int, + OUT attnum int, + OUT allnulls bool, + OUT hasnulls bool, + OUT placeholder bool, + OUT value text) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'brin_page_items' +LANGUAGE C STRICT; + +-- -- fsm_page_contents() -- CREATE FUNCTION fsm_page_contents(IN page bytea) diff --git a/contrib/pageinspect/pageinspect.control b/contrib/pageinspect/pageinspect.control index aecd91a711b..a9dab3327c9 100644 --- a/contrib/pageinspect/pageinspect.control +++ b/contrib/pageinspect/pageinspect.control @@ -1,5 +1,5 @@ # pageinspect extension comment = 'inspect the contents of database pages at a low level' -default_version = '1.2' +default_version = '1.3' module_pathname = '$libdir/pageinspect' relocatable = true diff --git a/contrib/pg_xlogdump/rmgrdesc.c b/contrib/pg_xlogdump/rmgrdesc.c index bfb35738789..93971982390 100644 --- a/contrib/pg_xlogdump/rmgrdesc.c +++ b/contrib/pg_xlogdump/rmgrdesc.c @@ -8,6 +8,7 @@ #define FRONTEND 1 #include "postgres.h" +#include "access/brin_xlog.h" #include "access/clog.h" #include "access/gin.h" #include "access/gist_private.h" diff --git a/doc/src/sgml/brin.sgml b/doc/src/sgml/brin.sgml new file mode 100644 index 00000000000..03d1fd62783 --- /dev/null +++ 
b/doc/src/sgml/brin.sgml @@ -0,0 +1,490 @@ +<!-- doc/src/sgml/brin.sgml --> + +<chapter id="BRIN"> +<title>BRIN Indexes</title> + + <indexterm> + <primary>index</primary> + <secondary>BRIN</secondary> + </indexterm> + +<sect1 id="brin-intro"> + <title>Introduction</title> + + <para> + <acronym>BRIN</acronym> stands for Block Range Index. + <acronym>BRIN</acronym> is designed for handling very large tables + in which certain columns have some natural correlation with their + physical location within the table. + A <firstterm>block range</> is a group of pages that are physically + adjacent in the table; for each block range, some summary info is stored + by the index. + For example, a table storing a store's sale orders might have + a date column on which each order was placed, and most of the time + the entries for earlier orders will appear earlier in the table as well; + a table storing a ZIP code column might have all codes for a city + grouped together naturally. + </para> + + <para> + <acronym>BRIN</acronym> indexes can satisfy queries via regular bitmap + index scans, and will return all tuples in all pages within each range if + the summary info stored by the index is <firstterm>consistent</> with the + query conditions. + The query executor is in charge of rechecking these tuples and discarding + those that do not match the query conditions — in other words, these + indexes are lossy. + Because a <acronym>BRIN</acronym> index is very small, scanning the index + adds little overhead compared to a sequential scan, but may avoid scanning + large parts of the table that are known not to contain matching tuples. + </para> + + <para> + The specific data that a <acronym>BRIN</acronym> index will store, + as well as the specific queries that the index will be able to satisfy, + depend on the operator class selected for each column of the index. 
+ Data types having a linear sort order can have operator classes that + store the minimum and maximum value within each block range, for instance; + geometrical types might store the bounding box for all the objects + in the block range. + </para> + + <para> + The size of the block range is determined at index creation time by + the <literal>pages_per_range</> storage parameter. The number of index + entries will be equal to the size of the relation in pages divided by + the selected value for <literal>pages_per_range</>. Therefore, the smaller + the number, the larger the index becomes (because of the need to + store more index entries), but at the same time the summary data stored can + be more precise and more data blocks can be skipped during an index scan. + </para> +</sect1> + +<sect1 id="brin-builtin-opclasses"> + <title>Built-in Operator Classes</title> + + <para> + The core <productname>PostgreSQL</productname> distribution + includes the <acronym>BRIN</acronym> operator classes shown in + <xref linkend="brin-builtin-opclasses-table">. + </para> + + <para> + The <firstterm>minmax</> + operator classes store the minimum and the maximum values appearing + in the indexed column within the range. 
+ </para> + + <table id="brin-builtin-opclasses-table"> + <title>Built-in <acronym>BRIN</acronym> Operator Classes</title> + <tgroup cols="3"> + <thead> + <row> + <entry>Name</entry> + <entry>Indexed Data Type</entry> + <entry>Indexable Operators</entry> + </row> + </thead> + <tbody> + <row> + <entry><literal>bytea_minmax_ops</literal></entry> + <entry><type>bytea</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>char_minmax_ops</literal></entry> + <entry><type>"char"</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>name_minmax_ops</literal></entry> + <entry><type>name</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>int8_minmax_ops</literal></entry> + <entry><type>bigint</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>int2_minmax_ops</literal></entry> + <entry><type>smallint</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>int4_minmax_ops</literal></entry> + <entry><type>integer</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>text_minmax_ops</literal></entry> + <entry><type>text</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> 
+ <entry><literal>oid_minmax_ops</literal></entry> + <entry><type>oid</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>tid_minmax_ops</literal></entry> + <entry><type>tid</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>float4_minmax_ops</literal></entry> + <entry><type>real</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>float8_minmax_ops</literal></entry> + <entry><type>double precision</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>abstime_minmax_ops</literal></entry> + <entry><type>abstime</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>reltime_minmax_ops</literal></entry> + <entry><type>reltime</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>macaddr_minmax_ops</literal></entry> + <entry><type>macaddr</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>inet_minmax_ops</literal></entry> + <entry><type>inet</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + 
<entry><literal>bpchar_minmax_ops</literal></entry> + <entry><type>character</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>date_minmax_ops</literal></entry> + <entry><type>date</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>time_minmax_ops</literal></entry> + <entry><type>time without time zone</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>timestamp_minmax_ops</literal></entry> + <entry><type>timestamp without time zone</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>timestamptz_minmax_ops</literal></entry> + <entry><type>timestamp with time zone</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>interval_minmax_ops</literal></entry> + <entry><type>interval</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>timetz_minmax_ops</literal></entry> + <entry><type>time with time zone</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>bit_minmax_ops</literal></entry> + <entry><type>bit</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + 
<literal>></literal> + </entry> + </row> + <row> + <entry><literal>varbit_minmax_ops</literal></entry> + <entry><type>bit varying</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>numeric_minmax_ops</literal></entry> + <entry><type>numeric</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>uuid_minmax_ops</literal></entry> + <entry><type>uuid</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + <row> + <entry><literal>pg_lsn_minmax_ops</literal></entry> + <entry><type>pg_lsn</type></entry> + <entry> + <literal><</literal> + <literal><=</literal> + <literal>=</literal> + <literal>>=</literal> + <literal>></literal> + </entry> + </row> + </tbody> + </tgroup> + </table> +</sect1> + +<sect1 id="brin-extensibility"> + <title>Extensibility</title> + + <para> + The <acronym>BRIN</acronym> interface has a high level of abstraction, + requiring the access method implementer only to implement the semantics + of the data type being accessed. The <acronym>BRIN</acronym> layer + itself takes care of concurrency, logging and searching the index structure. + </para> + + <para> + All it takes to get a <acronym>BRIN</acronym> access method working is to + implement a few user-defined methods, which define the behavior of + summary values stored in the index and the way they interact with + scan keys. + In short, <acronym>BRIN</acronym> combines + extensibility with generality, code reuse, and a clean interface. 
+ </para> + + <para> + There are four methods that an operator class for <acronym>BRIN</acronym> + must provide: + + <variablelist> + <varlistentry> + <term><function>BrinOpcInfo *opcInfo(Oid type_oid)</></term> + <listitem> + <para> + Returns internal information about the indexed columns' summary data. + The return value must point to a palloc'd <structname>BrinOpcInfo</>, + which has this definition: +<programlisting> +typedef struct BrinOpcInfo +{ + /* Number of columns stored in an index column of this opclass */ + uint16 oi_nstored; + + /* Opaque pointer for the opclass' private use */ + void *oi_opaque; + + /* Type IDs of the stored columns */ + Oid oi_typids[FLEXIBLE_ARRAY_MEMBER]; +} BrinOpcInfo; +</programlisting> + <structname>BrinOpcInfo</>.<structfield>oi_opaque</> can be used by the + operator class routines to pass information between support procedures + during an index scan. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><function>bool consistent(BrinDesc *bdesc, BrinValues *column, + ScanKey key)</function></term> + <listitem> + <para> + Returns whether the ScanKey is consistent with the given indexed + values for a range. + The attribute number to use is passed as part of the scan key. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><function>bool addValue(BrinDesc *bdesc, BrinValues *column, + Datum newval, bool isnull)</function></term> + <listitem> + <para> + Given an index tuple and an indexed value, modifies the indicated + attribute of the tuple so that it additionally represents the new value. + If any modification was done to the tuple, <literal>true</literal> is + returned. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><function>bool unionTuples(BrinDesc *bdesc, BrinValues *a, + BrinValues *b)</function></term> + <listitem> + <para> + Consolidates two index tuples. 
Given two index tuples, modifies the + indicated attribute of the first of them so that it represents both tuples. + The second tuple is not modified. + </para> + </listitem> + </varlistentry> + </variablelist> + + To implement these methods in a generic way, the operator class + defines its own internal support functions. + (For instance, <quote>min/max</> operator classes implement + support functions for the four inequality operators for the data type.) + Additionally, the operator class must supply appropriate + operator entries, + to enable the optimizer to use the index when those operators are + used in queries. + </para> +</sect1> +</chapter> diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index 5902f979c89..f03b72ab1db 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -87,6 +87,7 @@ <!ENTITY gist SYSTEM "gist.sgml"> <!ENTITY spgist SYSTEM "spgist.sgml"> <!ENTITY gin SYSTEM "gin.sgml"> +<!ENTITY brin SYSTEM "brin.sgml"> <!ENTITY planstats SYSTEM "planstats.sgml"> <!ENTITY indexam SYSTEM "indexam.sgml"> <!ENTITY nls SYSTEM "nls.sgml"> diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml index 64530a11c86..b73463a3238 100644 --- a/doc/src/sgml/indices.sgml +++ b/doc/src/sgml/indices.sgml @@ -116,7 +116,8 @@ CREATE INDEX test1_id_index ON test1 (id); <para> <productname>PostgreSQL</productname> provides several index types: - B-tree, Hash, GiST, SP-GiST and GIN. Each index type uses a different + B-tree, Hash, GiST, SP-GiST, GIN and BRIN. + Each index type uses a different algorithm that is best suited to different types of queries. By default, the <command>CREATE INDEX</command> command creates B-tree indexes, which fit the most common situations. @@ -326,6 +327,39 @@ SELECT * FROM places ORDER BY location <-> point '(101,456)' LIMIT 10; classes are available in the <literal>contrib</> collection or as separate projects. For more information see <xref linkend="GIN">. 
</para> + + <para> + <indexterm> + <primary>index</primary> + <secondary>BRIN</secondary> + </indexterm> + <indexterm> + <primary>BRIN</primary> + <see>index</see> + </indexterm> + BRIN indexes (a shorthand for Block Range indexes) + store summaries about the values stored in consecutive table physical block ranges. + Like GiST, SP-GiST and GIN, + BRIN can support many different indexing strategies, + and the particular operators with which a BRIN index can be used + vary depending on the indexing strategy. + For datatypes that have a linear sort order, the indexed data + corresponds to the minimum and maximum values of the + values in the column for each block range, + which support indexed queries using these operators: + + <simplelist> + <member><literal><</literal></member> + <member><literal><=</literal></member> + <member><literal>=</literal></member> + <member><literal>>=</literal></member> + <member><literal>></literal></member> + </simplelist> + + The BRIN operator classes included in the standard distribution are + documented in <xref linkend="brin-builtin-opclasses-table">. + For more information see <xref linkend="BRIN">. + </para> </sect1> diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml index 191fb156c13..70517ac4e17 100644 --- a/doc/src/sgml/pageinspect.sgml +++ b/doc/src/sgml/pageinspect.sgml @@ -198,6 +198,110 @@ test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1); <varlistentry> <term> + <function>brin_page_type(page bytea) returns text</function> + <indexterm> + <primary>brin_page_type</primary> + </indexterm> + </term> + + <listitem> + <para> + <function>brin_page_type</function> returns the page type of the given + <acronym>BRIN</acronym> index page, or throws an error if the page is + not a valid <acronym>BRIN</acronym> page. 
For example: +<screen> +brintest=# select brin_page_type(get_raw_page('brinidx', 0)); + brin_page_type +---------------- + meta +</screen> + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term> + <function>brin_metapage_info(page bytea) returns record</function> + <indexterm> + <primary>brin_metapage_info</primary> + </indexterm> + </term> + + <listitem> + <para> + <function>brin_metapage_info</function> returns assorted information + about a <acronym>BRIN</acronym> index metapage. For example: +<screen> +brintest=# select * from brin_metapage_info(get_raw_page('brinidx', 0)); + magic | version | pagesperrange | lastrevmappage +------------+---------+---------------+---------------- + 0xA8109CFA | 1 | 4 | 2 +</screen> + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term> + <function>brin_revmap_data(page bytea) returns setof tid</function> + <indexterm> + <primary>brin_revmap_data</primary> + </indexterm> + </term> + + <listitem> + <para> + <function>brin_revmap_data</function> returns the list of tuple + identifiers in a <acronym>BRIN</acronym> index range map page. + For example: +<screen> +brintest=# select * from brin_revmap_data(get_raw_page('brinidx', 2)) limit 5; + pages +--------- + (6,137) + (6,138) + (6,139) + (6,140) + (6,141) +</screen> + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term> + <function>brin_page_items(page bytea, index oid) returns setof record</function> + <indexterm> + <primary>brin_page_items</primary> + </indexterm> + </term> + + <listitem> + <para> + <function>brin_page_items</function> returns the data stored in the + <acronym>BRIN</acronym> data page. 
For example: +<screen> +brintest=# select * from brin_page_items(get_raw_page('brinidx', 5), +brintest(# 'brinidx') +brintest-# order by blknum, attnum limit 6; + itemoffset | blknum | attnum | allnulls | hasnulls | placeholder | value +------------+--------+--------+----------+----------+-------------+-------------- + 137 | 0 | 1 | t | f | f | + 137 | 0 | 2 | f | f | f | {1 .. 88} + 138 | 4 | 1 | t | f | f | + 138 | 4 | 2 | f | f | f | {89 .. 176} + 139 | 8 | 1 | t | f | f | + 139 | 8 | 2 | f | f | f | {177 .. 264} +</screen> + The returned columns correspond to the fields in the + <structname>BrinMemTuple</> and <structname>BrinValues</> structs. + See <filename>src/include/access/brin_tuple.h</> for details. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term> <function>fsm_page_contents(page bytea) returns text</function> <indexterm> <primary>fsm_page_contents</primary> diff --git a/doc/src/sgml/postgres.sgml b/doc/src/sgml/postgres.sgml index 9bde1085e9b..a648a4c5f64 100644 --- a/doc/src/sgml/postgres.sgml +++ b/doc/src/sgml/postgres.sgml @@ -247,6 +247,7 @@ &gist; &spgist; &gin; + &brin; &storage; &bki; &planstats; diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index c32088f81df..21721b48f04 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -8,6 +8,6 @@ subdir = src/backend/access top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global -SUBDIRS = common gin gist hash heap index nbtree rmgrdesc spgist transam +SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist transam include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/brin/Makefile b/src/backend/access/brin/Makefile new file mode 100644 index 00000000000..ac44fcdee39 --- /dev/null +++ b/src/backend/access/brin/Makefile @@ -0,0 +1,18 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/brin +# +# IDENTIFICATION +# src/backend/access/brin/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/brin +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = brin.o brin_pageops.o brin_revmap.o brin_tuple.o brin_xlog.o \ + brin_minmax.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/brin/README b/src/backend/access/brin/README new file mode 100644 index 00000000000..2619be8db56 --- /dev/null +++ b/src/backend/access/brin/README @@ -0,0 +1,189 @@ +Block Range Indexes (BRIN) +========================== + +BRIN indexes intend to enable very fast scanning of extremely large tables. + +The essential idea of a BRIN index is to keep track of summarizing values in +consecutive groups of heap pages (page ranges); for example, the minimum and +maximum values for datatypes with a btree opclass, or the bounding box for +geometric types. These values can be used to avoid scanning such pages +during a table scan, depending on query quals. + +The cost of this is having to update the stored summary values of each page +range as tuples are inserted into them. + + +Access Method Design +-------------------- + +Since item pointers are not stored inside indexes of this type, it is not +possible to support the amgettuple interface. Instead, we only provide +amgetbitmap support. 
The amgetbitmap routine returns a lossy TIDBitmap +comprising all pages in those page ranges that match the query +qualifications. The recheck step in the BitmapHeapScan node prunes tuples +that are not visible according to the query qualifications. + +An operator class must have the following entries: + +- generic support procedures (pg_amproc), identical to all opclasses: + * "opcinfo" (BRIN_PROCNUM_OPCINFO) initializes a structure for index + creation or scanning + * "addValue" (BRIN_PROCNUM_ADDVALUE) takes an index tuple and a heap item, + and possibly changes the index tuple so that it includes the heap item + values + * "consistent" (BRIN_PROCNUM_CONSISTENT) takes an index tuple and query + quals, and returns whether the index tuple values match the query quals. + * "union" (BRIN_PROCNUM_UNION) takes two index tuples and modifies the first + one so that it represents the union of the two. +Procedure numbers up to 10 are reserved for future expansion. + +Additionally, each opclass needs additional support functions: +- Minmax-style operator classes: + * Proc numbers 11-14 are used for the functions implementing inequality + operators for the type, in this order: less than, less or equal, + greater or equal, greater than. + +Opclasses using a different design will require different additional procedure +numbers. + +Operator classes also need to have operator (pg_amop) entries so that the +optimizer can choose the index to execute queries. +- Minmax-style operator classes: + * The same operators as btree (<=, <, =, >=, >) + +Each index tuple stores some NULL bits and some opclass-specified values, which +are stored in a single null bitmask of length twice the number of columns. 
The +generic NULL bits indicate, for each column: + * bt_hasnulls: Whether there's any NULL value at all in the page range + * bt_allnulls: Whether all values are NULLs in the page range + +The opclass-specified values are: +- Minmax-style operator classes + * minimum value across all tuples in the range + * maximum value across all tuples in the range + +Note that the addValue and Union support procedures must be careful to +datumCopy() the values they want to store in the in-memory BRIN tuple, and +must pfree() the old copies when replacing older ones. Since some values +referenced from the tuple persist and others go away, there is no +well-defined lifetime for a memory context that would make this automatic. + + +The Range Map +------------- + +To find the index tuple for a particular page range, we have an internal +structure we call the range map, or "revmap" for short. This stores one TID +per page range, which is the address of the index tuple summarizing that +range. Since the map entries are fixed size, it is possible to compute the +address of the range map entry for any given heap page by simple arithmetic. + +When a new heap tuple is inserted in a summarized page range, we compare the +existing index tuple with the new heap tuple. If the heap tuple is outside +the summarization data given by the index tuple for any indexed column (or +if the new heap tuple contains null values but the index tuple indicates +there are no nulls), the index is updated with the new values. In many +cases it is possible to update the index tuple in-place, but if the new +index tuple is larger than the old one and there's not enough space in the +page, it is necessary to create a new index tuple with the new values. The +range map can be updated quickly to point to it; the old index tuple is +removed. + +If the range map points to an invalid TID, the corresponding page range is +considered to be not summarized. 
When tuples are added to unsummarized +pages, nothing needs to happen. + +To scan a table following a BRIN index, we scan the range map sequentially. +This yields index tuples in ascending page range order. Query quals are +matched to each index tuple; if they match, each page within the page range +is returned as part of the output TID bitmap. If there's no match, they are +skipped. Range map entries returning invalid index TIDs, that is +unsummarized page ranges, are also returned in the TID bitmap. + +The revmap is stored in the first few blocks of the index main fork, +immediately following the metapage. Whenever the revmap needs to be +extended by another page, existing tuples in that page are moved to some +other page. + +Heap tuples can be removed from anywhere without restriction. It might be +useful to mark the corresponding index tuple somehow, if the heap tuple is +one of the constraining values of the summary data (i.e. either min or max +in the case of a btree-opclass-bearing datatype), so that in the future we +are aware of the need to re-execute summarization on that range, leading to +a possible tightening of the summary values. + +Summarization +------------- + +At index creation time, the whole table is scanned; for each page range the +summarizing values of each indexed column and nulls bitmap are collected and +stored in the index. The partially-filled page range at the end of the +table is also summarized. + +As new tuples get inserted at the end of the table, they may update the +index tuple that summarizes the partial page range at the end. Eventually +that page range is complete and new tuples belong in a new page range that +hasn't yet been summarized. Those insertions do not create a new index +entry; instead, the page range remains unsummarized until later. + +When VACUUM is run on the table, all unsummarized page ranges are +summarized. This action can also be invoked by the user via +brin_summarize_new_values(). 
Both these procedures scan all the +unsummarized ranges, and create a summary tuple. Again, this includes the +partially-filled page range at the end of the table. + +Vacuuming +--------- + +Since no heap TIDs are stored in a BRIN index, it's not necessary to scan the +index when heap tuples are removed. It might be that some summary values can +be tightened if heap tuples have been deleted; but this would represent an +optimization opportunity only, not a correctness issue. It's simpler to +represent this as the need to re-run summarization on the affected page range +rather than "subtracting" values from the existing one. This is not +currently implemented. + +Note that if there are no indexes on the table other than the BRIN index, +usage of maintenance_work_mem by vacuum can be decreased significantly, because +no detailed index scan needs to take place (and thus it's not necessary for +vacuum to save TIDs to remove). It's unlikely that BRIN would be the only +indexes in a table, though, because primary keys can be btrees only, and so +we don't implement this optimization. + + +Optimizer +--------- + +The optimizer selects the index based on the operator class' pg_amop +entries for the column. + + +Future improvements +------------------- + +* Different-size page ranges? + In the current design, each "index entry" in a BRIN index covers the same + number of pages. There's no hard reason for this; it might make sense to + allow the index to self-tune so that some index entries cover smaller page + ranges, if this allows the summary values to be more compact. This would incur + larger BRIN overhead for the index itself, but might allow better pruning of + page ranges during scan. 
In the limit of one index tuple per page, the index + itself would occupy too much space, even though we would be able to skip + reading the most heap pages, because the summary values are tight; in the + opposite limit of a single tuple that summarizes the whole table, we wouldn't + be able to prune anything even though the index is very small. This can + probably be made to work by using the range map as an index in itself. + +* More compact representation for TIDBitmap? + TIDBitmap is the structure used to represent bitmap scans. The + representation of lossy page ranges is not optimal for our purposes, because + it uses a Bitmapset to represent pages in the range; since we're going to return + all pages in a large range, it might be more convenient to allow for a + struct that uses start and end page numbers to represent the range, instead. + +* Better vacuuming? + It might be useful to enable passing more useful info to BRIN indexes during + vacuuming about tuples that are deleted, i.e. do not require the callback to + pass each tuple's TID. For instance we might need a callback that passes a + block number instead of a TID. That would help determine when to re-run + summarization on blocks that have seen lots of tuple deletions. diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c new file mode 100644 index 00000000000..76cc36c3469 --- /dev/null +++ b/src/backend/access/brin/brin.c @@ -0,0 +1,1228 @@ +/* + * brin.c + * Implementation of BRIN indexes for Postgres + * + * See src/backend/access/brin/README for details. 
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/brin/brin.c
+ *
+ * TODO
+ *		* ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
+ */
+#include "postgres.h"
+
+#include "access/brin.h"
+#include "access/brin_internal.h"
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_xlog.h"
+#include "access/reloptions.h"
+#include "access/relscan.h"
+#include "access/xact.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/freespace.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+
+
+/*
+ * We use a BrinBuildState during initial construction of a BRIN index.
+ * The running state is kept in a BrinMemTuple.
+ *
+ * brinbuild() creates one of these and hands it to brinbuildCallback() as
+ * the opaque per-scan state; the callback accumulates heap tuples into
+ * bs_dtuple and flushes it each time a page range is completed.
+ */
+typedef struct BrinBuildState
+{
+	Relation	bs_irel;		/* presumably the index being built -- TODO confirm */
+	int			bs_numtuples;	/* reported by brinbuild() as index_tuples */
+	Buffer		bs_currentInsertBuf;	/* use not visible here -- TODO confirm */
+	BlockNumber bs_pagesPerRange;	/* heap blocks covered by one range */
+	BlockNumber bs_currRangeStart;	/* first heap block of range being accumulated */
+	BrinRevmap *bs_rmAccess;	/* revmap handle; closed with brinRevmapTerminate */
+	BrinDesc   *bs_bdesc;		/* descriptor passed to the addValue procs */
+	BrinMemTuple *bs_dtuple;	/* running summary for the current range */
+} BrinBuildState;
+
+/*
+ * Struct used as "opaque" during index scans
+ *
+ * Allocated and filled in by brinbeginscan(), released by brinendscan().
+ */
+typedef struct BrinOpaque
+{
+	BlockNumber bo_pagesPerRange;	/* set by brinRevmapInitialize() */
+	BrinRevmap *bo_rmAccess;	/* revmap handle for the scan */
+	BrinDesc   *bo_bdesc;		/* built with brin_build_desc() */
+} BrinOpaque;
+
+PG_FUNCTION_INFO_V1(brin_summarize_new_values);
+
+static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
+						   BrinRevmap *revmap, BlockNumber pagesPerRange);
+static void terminate_brin_buildstate(BrinBuildState *state);
+static void brinsummarize(Relation index, Relation heapRel,
+			  double *numSummarized, double *numExisting);
+static void form_and_insert_tuple(BrinBuildState *state);
+static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
+			 BrinTuple *b);
+
+
+/*
+ * A tuple in the heap is being inserted. 
To keep a brin index up to date, + * we need to obtain the relevant index tuple and compare its stored values + * with those of the new tuple. If the tuple values are not consistent with + * the summary tuple, we need to update the index tuple. + * + * If the range is not currently summarized (i.e. the revmap returns NULL for + * it), there's nothing to do. + */ +Datum +brininsert(PG_FUNCTION_ARGS) +{ + Relation idxRel = (Relation) PG_GETARG_POINTER(0); + Datum *values = (Datum *) PG_GETARG_POINTER(1); + bool *nulls = (bool *) PG_GETARG_POINTER(2); + ItemPointer heaptid = (ItemPointer) PG_GETARG_POINTER(3); + + /* we ignore the rest of our arguments */ + BlockNumber pagesPerRange; + BrinDesc *bdesc = NULL; + BrinRevmap *revmap; + Buffer buf = InvalidBuffer; + MemoryContext tupcxt = NULL; + MemoryContext oldcxt = NULL; + + revmap = brinRevmapInitialize(idxRel, &pagesPerRange); + + for (;;) + { + bool need_insert = false; + OffsetNumber off; + BrinTuple *brtup; + BrinMemTuple *dtup; + BlockNumber heapBlk; + int keyno; + BrinTuple *tmptup PG_USED_FOR_ASSERTS_ONLY; + BrinMemTuple *tmpdtup PG_USED_FOR_ASSERTS_ONLY; + Size tmpsiz PG_USED_FOR_ASSERTS_ONLY; + + CHECK_FOR_INTERRUPTS(); + + heapBlk = ItemPointerGetBlockNumber(heaptid); + /* normalize the block number to be the first block in the range */ + heapBlk = (heapBlk / pagesPerRange) * pagesPerRange; + brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, + BUFFER_LOCK_SHARE); + + /* if range is unsummarized, there's nothing to do */ + if (!brtup) + break; + + /* First time through? 
*/ + if (bdesc == NULL) + { + bdesc = brin_build_desc(idxRel); + tupcxt = AllocSetContextCreate(CurrentMemoryContext, + "brininsert cxt", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcxt = MemoryContextSwitchTo(tupcxt); + } + + dtup = brin_deform_tuple(bdesc, brtup); + +#ifdef USE_ASSERT_CHECKING + { + /* + * When assertions are enabled, we use this as an opportunity to + * test the "union" method, which would otherwise be used very + * rarely: first create a placeholder tuple, and addValue the + * value we just got into it. Then union the existing index tuple + * with the updated placeholder tuple. The tuple resulting from + * that union should be identical to the one resulting from the + * regular operation (straight addValue) below. + * + * Here we create the tuple to compare with; the actual comparison + * is below. + */ + tmptup = brin_form_placeholder_tuple(bdesc, heapBlk, &tmpsiz); + tmpdtup = brin_deform_tuple(bdesc, tmptup); + for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) + { + BrinValues *bval; + FmgrInfo *addValue; + + bval = &tmpdtup->bt_columns[keyno]; + addValue = index_getprocinfo(idxRel, keyno + 1, + BRIN_PROCNUM_ADDVALUE); + FunctionCall4Coll(addValue, + idxRel->rd_indcollation[keyno], + PointerGetDatum(bdesc), + PointerGetDatum(bval), + values[keyno], + nulls[keyno]); + } + + union_tuples(bdesc, tmpdtup, brtup); + + tmpdtup->bt_placeholder = dtup->bt_placeholder; + tmptup = brin_form_tuple(bdesc, heapBlk, tmpdtup, &tmpsiz); + } +#endif + + /* + * Compare the key values of the new tuple to the stored index values; + * our deformed tuple will get updated if the new tuple doesn't fit + * the original range (note this means we can't break out of the loop + * early). Make a note of whether this happens, so that we know to + * insert the modified tuple later. 
+ */ + for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) + { + Datum result; + BrinValues *bval; + FmgrInfo *addValue; + + bval = &dtup->bt_columns[keyno]; + addValue = index_getprocinfo(idxRel, keyno + 1, + BRIN_PROCNUM_ADDVALUE); + result = FunctionCall4Coll(addValue, + idxRel->rd_indcollation[keyno], + PointerGetDatum(bdesc), + PointerGetDatum(bval), + values[keyno], + nulls[keyno]); + /* if that returned true, we need to insert the updated tuple */ + need_insert |= DatumGetBool(result); + } + +#ifdef USE_ASSERT_CHECKING + { + /* + * Now we can compare the tuple produced by the union function + * with the one from plain addValue. + */ + BrinTuple *cmptup; + Size cmpsz; + + cmptup = brin_form_tuple(bdesc, heapBlk, dtup, &cmpsz); + Assert(brin_tuples_equal(tmptup, tmpsiz, cmptup, cmpsz)); + } +#endif + + if (!need_insert) + { + /* + * The tuple is consistent with the new values, so there's nothing + * to do. + */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + else + { + Page page = BufferGetPage(buf); + ItemId lp = PageGetItemId(page, off); + Size origsz; + BrinTuple *origtup; + Size newsz; + BrinTuple *newtup; + bool samepage; + + /* + * Make a copy of the old tuple, so that we can compare it after + * re-acquiring the lock. + */ + origsz = ItemIdGetLength(lp); + origtup = brin_copy_tuple(brtup, origsz); + + /* + * Before releasing the lock, check if we can attempt a same-page + * update. Another process could insert a tuple concurrently in + * the same page though, so downstream we must be prepared to cope + * if this turns out to not be possible after all. + */ + samepage = brin_can_do_samepage_update(buf, origsz, newsz); + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); + + /* + * Try to update the tuple. 
If this doesn't work for whatever + * reason, we need to restart from the top; the revmap might be + * pointing at a different tuple for this block now, so we need to + * recompute to ensure both our new heap tuple and the other + * inserter's are covered by the combined tuple. It might be that + * we don't need to update at all. + */ + if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, + buf, off, origtup, origsz, newtup, newsz, + samepage)) + { + /* no luck; start over */ + MemoryContextResetAndDeleteChildren(tupcxt); + continue; + } + } + + /* success! */ + break; + } + + brinRevmapTerminate(revmap); + if (BufferIsValid(buf)) + ReleaseBuffer(buf); + if (bdesc != NULL) + { + brin_free_desc(bdesc); + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(tupcxt); + } + + return BoolGetDatum(false); +} + +/* + * Initialize state for a BRIN index scan. + * + * We read the metapage here to determine the pages-per-range number that this + * index was built with. Note that since this cannot be changed while we're + * holding lock on index, it's not necessary to recompute it during brinrescan. + */ +Datum +brinbeginscan(PG_FUNCTION_ARGS) +{ + Relation r = (Relation) PG_GETARG_POINTER(0); + int nkeys = PG_GETARG_INT32(1); + int norderbys = PG_GETARG_INT32(2); + IndexScanDesc scan; + BrinOpaque *opaque; + + scan = RelationGetIndexScan(r, nkeys, norderbys); + + opaque = (BrinOpaque *) palloc(sizeof(BrinOpaque)); + opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange); + opaque->bo_bdesc = brin_build_desc(r); + scan->opaque = opaque; + + PG_RETURN_POINTER(scan); +} + +/* + * Execute the index scan. + * + * This works by reading index TIDs from the revmap, and obtaining the index + * tuples pointed to by them; the summary values in the index tuples are + * compared to the scan keys. We return into the TID bitmap all the pages in + * ranges corresponding to index tuples that match the scan keys. 
+ * + * If a TID from the revmap is read as InvalidTID, we know that range is + * unsummarized. Pages in those ranges need to be returned regardless of scan + * keys. + * + * XXX see _bt_first on what to do about sk_subtype. + */ +Datum +bringetbitmap(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); + Relation idxRel = scan->indexRelation; + Buffer buf = InvalidBuffer; + BrinDesc *bdesc; + Oid heapOid; + Relation heapRel; + BrinOpaque *opaque; + BlockNumber nblocks; + BlockNumber heapBlk; + int totalpages = 0; + int keyno; + FmgrInfo *consistentFn; + MemoryContext oldcxt; + MemoryContext perRangeCxt; + + opaque = (BrinOpaque *) scan->opaque; + bdesc = opaque->bo_bdesc; + pgstat_count_index_scan(idxRel); + + /* + * We need to know the size of the table so that we know how long to + * iterate on the revmap. + */ + heapOid = IndexGetRelation(RelationGetRelid(idxRel), false); + heapRel = heap_open(heapOid, AccessShareLock); + nblocks = RelationGetNumberOfBlocks(heapRel); + heap_close(heapRel, AccessShareLock); + + /* + * Obtain consistent functions for all indexed column. Maybe it'd be + * possible to do this lazily only the first time we see a scan key that + * involves each particular attribute. + */ + consistentFn = palloc(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts); + for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) + { + FmgrInfo *tmp; + + tmp = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_CONSISTENT); + fmgr_info_copy(&consistentFn[keyno], tmp, CurrentMemoryContext); + } + + /* + * Setup and use a per-range memory context, which is reset every time we + * loop below. This avoids having to free the tuples within the loop. 
+ */ + perRangeCxt = AllocSetContextCreate(CurrentMemoryContext, + "bringetbitmap cxt", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcxt = MemoryContextSwitchTo(perRangeCxt); + + /* + * Now scan the revmap. We start by querying for heap page 0, + * incrementing by the number of pages per range; this gives us a full + * view of the table. + */ + for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange) + { + bool addrange; + BrinTuple *tup; + OffsetNumber off; + Size size; + + CHECK_FOR_INTERRUPTS(); + + MemoryContextResetAndDeleteChildren(perRangeCxt); + + tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf, + &off, &size, BUFFER_LOCK_SHARE); + if (tup) + { + tup = brin_copy_tuple(tup, size); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + + /* + * For page ranges with no indexed tuple, we must return the whole + * range; otherwise, compare it to the scan keys. + */ + if (tup == NULL) + { + addrange = true; + } + else + { + BrinMemTuple *dtup; + int keyno; + + dtup = brin_deform_tuple(bdesc, tup); + if (dtup->bt_placeholder) + { + /* + * Placeholder tuples are always returned, regardless of the + * values stored in them. + */ + addrange = true; + } + else + { + /* + * Compare scan keys with summary values stored for the range. + * If scan keys are matched, the page range must be added to + * the bitmap. We initially assume the range needs to be + * added; in particular this serves the case where there are + * no keys. + */ + addrange = true; + for (keyno = 0; keyno < scan->numberOfKeys; keyno++) + { + ScanKey key = &scan->keyData[keyno]; + AttrNumber keyattno = key->sk_attno; + BrinValues *bval = &dtup->bt_columns[keyattno - 1]; + Datum add; + + /* + * The collation of the scan key must match the collation + * used in the index column (but only if the search is not + * IS NULL/ IS NOT NULL). Otherwise we shouldn't be using + * this index ... 
+ */ + Assert((key->sk_flags & SK_ISNULL) || + (key->sk_collation == + bdesc->bd_tupdesc->attrs[keyattno - 1]->attcollation)); + + /* + * Check whether the scan key is consistent with the page + * range values; if so, have the pages in the range added + * to the output bitmap. + * + * When there are multiple scan keys, failure to meet the + * criteria for a single one of them is enough to discard + * the range as a whole, so break out of the loop as soon + * as a false return value is obtained. + */ + add = FunctionCall3Coll(&consistentFn[keyattno - 1], + key->sk_collation, + PointerGetDatum(bdesc), + PointerGetDatum(bval), + PointerGetDatum(key)); + addrange = DatumGetBool(add); + if (!addrange) + break; + } + } + } + + /* add the pages in the range to the output bitmap, if needed */ + if (addrange) + { + BlockNumber pageno; + + for (pageno = heapBlk; + pageno <= heapBlk + opaque->bo_pagesPerRange - 1; + pageno++) + { + MemoryContextSwitchTo(oldcxt); + tbm_add_page(tbm, pageno); + totalpages++; + MemoryContextSwitchTo(perRangeCxt); + } + } + } + + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(perRangeCxt); + + if (buf != InvalidBuffer) + ReleaseBuffer(buf); + + /* + * XXX We have an approximation of the number of *pages* that our scan + * returns, but we don't have a precise idea of the number of heap tuples + * involved. 
+ */ + PG_RETURN_INT64(totalpages * 10); +} + +/* + * Re-initialize state for a BRIN index scan + */ +Datum +brinrescan(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1); + + /* other arguments ignored */ + + if (scankey && scan->numberOfKeys > 0) + memmove(scan->keyData, scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + + PG_RETURN_VOID(); +} + +/* + * Close down a BRIN index scan + */ +Datum +brinendscan(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + BrinOpaque *opaque = (BrinOpaque *) scan->opaque; + + brinRevmapTerminate(opaque->bo_rmAccess); + brin_free_desc(opaque->bo_bdesc); + pfree(opaque); + + PG_RETURN_VOID(); +} + +Datum +brinmarkpos(PG_FUNCTION_ARGS) +{ + elog(ERROR, "BRIN does not support mark/restore"); + PG_RETURN_VOID(); +} + +Datum +brinrestrpos(PG_FUNCTION_ARGS) +{ + elog(ERROR, "BRIN does not support mark/restore"); + PG_RETURN_VOID(); +} + +/* + * Per-heap-tuple callback for IndexBuildHeapScan. + * + * Note we don't worry about the page range at the end of the table here; it is + * present in the build state struct after we're called the last time, but not + * inserted into the index. Caller must ensure to do so, if appropriate. + */ +static void +brinbuildCallback(Relation index, + HeapTuple htup, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *brstate) +{ + BrinBuildState *state = (BrinBuildState *) brstate; + BlockNumber thisblock; + int i; + + thisblock = ItemPointerGetBlockNumber(&htup->t_self); + + /* + * If we're in a block that belongs to a future range, summarize what we've + * got and start afresh. Note the scan might have skipped many pages, + * if they were devoid of live tuples; make sure to insert index tuples + * for those too. 
+ */
+    while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
+    {
+
+        BRIN_elog(DEBUG2, "brinbuildCallback: completed a range: %u--%u",
+                  state->bs_currRangeStart,
+                  state->bs_currRangeStart + state->bs_pagesPerRange);
+
+        /* create the index tuple and insert it */
+        form_and_insert_tuple(state);
+
+        /* set state to correspond to the next range */
+        state->bs_currRangeStart += state->bs_pagesPerRange;
+
+        /* re-initialize state for it */
+        brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
+    }
+
+    /* Accumulate the current tuple into the running state */
+    for (i = 0; i < state->bs_bdesc->bd_tupdesc->natts; i++)
+    {
+        FmgrInfo *addValue;
+        BrinValues *col;
+
+        col = &state->bs_dtuple->bt_columns[i];
+        addValue = index_getprocinfo(index, i + 1,
+                                     BRIN_PROCNUM_ADDVALUE);
+
+        /*
+         * Update dtuple state, if and as necessary.
+         */
+        FunctionCall4Coll(addValue,
+                          state->bs_bdesc->bd_tupdesc->attrs[i]->attcollation,
+                          PointerGetDatum(state->bs_bdesc),
+                          PointerGetDatum(col),
+                          values[i], isnull[i]); /* NOTE(review): isnull[i] is
+                                                  * passed as a Datum without
+                                                  * BoolGetDatum -- confirm */
+    }
+}
+
+/*
+ * brinbuild() -- build a new BRIN index.
+ *
+ * fmgr arguments: 0 = heap relation, 1 = index relation, 2 = IndexInfo.
+ *
+ * Errors out if the index already has blocks.  Otherwise creates and
+ * initializes the metapage (WAL-logging it when the relation needs WAL),
+ * scans the heap through brinbuildCallback, inserts the tuple for the final
+ * range, and returns a palloc'd IndexBuildResult carrying the heap and index
+ * tuple counts.
+ */
+Datum
+brinbuild(PG_FUNCTION_ARGS)
+{
+    Relation heap = (Relation) PG_GETARG_POINTER(0);
+    Relation index = (Relation) PG_GETARG_POINTER(1);
+    IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
+    IndexBuildResult *result;
+    double reltuples;
+    double idxtuples;
+    BrinRevmap *revmap;
+    BrinBuildState *state;
+    Buffer meta;
+    BlockNumber pagesPerRange;
+
+    /*
+     * We expect to be called exactly once for any index relation.
+     */
+    if (RelationGetNumberOfBlocks(index) != 0)
+        elog(ERROR, "index \"%s\" already contains data",
+             RelationGetRelationName(index));
+
+    /*
+     * Critical section not required, because on error the creation of the
+     * whole relation will be rolled back.
+ */
+
+    meta = ReadBuffer(index, P_NEW);
+    Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
+    LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE);
+
+    brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
+                       BRIN_CURRENT_VERSION);
+    MarkBufferDirty(meta);
+
+    if (RelationNeedsWAL(index))
+    {
+        xl_brin_createidx xlrec;
+        XLogRecPtr recptr;
+        XLogRecData rdata;
+        Page page;
+
+        xlrec.node = index->rd_node;
+        xlrec.version = BRIN_CURRENT_VERSION;
+        xlrec.pagesPerRange = BrinGetPagesPerRange(index);
+
+        rdata.buffer = InvalidBuffer;
+        rdata.data = (char *) &xlrec;
+        rdata.len = SizeOfBrinCreateIdx;
+        rdata.next = NULL;
+
+        recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX, &rdata);
+
+        page = BufferGetPage(meta);
+        PageSetLSN(page, recptr);
+    }
+
+    UnlockReleaseBuffer(meta);
+
+    /*
+     * Initialize our state, including the deformed tuple state.
+     */
+    revmap = brinRevmapInitialize(index, &pagesPerRange);
+    state = initialize_brin_buildstate(index, revmap, pagesPerRange);
+
+    /*
+     * Now scan the relation.  No syncscan allowed here because we want the
+     * heap blocks in physical order.
+     */
+    reltuples = IndexBuildHeapScan(heap, index, indexInfo, false,
+                                   brinbuildCallback, (void *) state);
+
+    /*
+     * process the final batch: the callback only emits a tuple once it sees
+     * a block past the current range, so the last range is still pending.
+     */
+    form_and_insert_tuple(state);
+
+    /* release resources (read the tuple count before the state is freed) */
+    idxtuples = state->bs_numtuples;
+    brinRevmapTerminate(state->bs_rmAccess);
+    terminate_brin_buildstate(state);
+
+    /*
+     * Return statistics
+     */
+    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+    result->heap_tuples = reltuples;
+    result->index_tuples = idxtuples;
+
+    PG_RETURN_POINTER(result);
+}
+
+Datum
+brinbuildempty(PG_FUNCTION_ARGS)
+{
+
+    Relation index = (Relation) PG_GETARG_POINTER(0);
+    Buffer metabuf;
+
+    /*
+     * An empty BRIN index has a metapage only.  The page is allocated in the
+     * fork named by INIT_FORKNUM.
+     */
+    metabuf =
+        ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+
+    /* Initialize and xlog metabuffer. */
+    START_CRIT_SECTION();
+    brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
+                       BRIN_CURRENT_VERSION);
+    MarkBufferDirty(metabuf);
+    log_newpage_buffer(metabuf, false);
+    END_CRIT_SECTION();
+
+    UnlockReleaseBuffer(metabuf);
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * brinbulkdelete
+ *      Since there are no per-heap-tuple index tuples in BRIN indexes,
+ *      there's not a lot we can do here.
+ *
+ * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
+ * tuple is deleted), meaning the need to re-run summarization on the affected
+ * range.  That would need an extra flag in mmtuples.
+ *
+ * Returns the stats struct passed as argument 1, allocating a zeroed one if
+ * it is NULL.
+ */
+Datum
+brinbulkdelete(PG_FUNCTION_ARGS)
+{
+    /* other arguments are not currently used */
+    IndexBulkDeleteResult *stats =
+    (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
+
+    /* allocate stats if first time through, else re-use existing struct */
+    if (stats == NULL)
+        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+    PG_RETURN_POINTER(stats);
+}
+
+/*
+ * This routine is in charge of "vacuuming" a BRIN index: we just summarize
+ * ranges that are currently unsummarized.
+ */
+Datum
+brinvacuumcleanup(PG_FUNCTION_ARGS)
+{
+    IndexVacuumInfo *vacinfo = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
+    IndexBulkDeleteResult *result =
+    (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
+    Oid heapoid;
+    Relation heap;
+
+    /* An ANALYZE-only call does no summarization; hand back what we got */
+    if (vacinfo->analyze_only)
+        PG_RETURN_POINTER(result);
+
+    /* No stats struct was supplied: start from an all-zeroes one */
+    if (result == NULL)
+        result = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+    result->num_pages = RelationGetNumberOfBlocks(vacinfo->index);
+
+    heapoid = IndexGetRelation(RelationGetRelid(vacinfo->index), false);
+    heap = heap_open(heapoid, AccessShareLock);
+
+    /*
+     * The same counter is passed for both "newly summarized" and "already
+     * existing", so num_index_tuples ends up as the sum of the two.
+     */
+    brinsummarize(vacinfo->index, heap,
+                  &result->num_index_tuples, &result->num_index_tuples);
+
+    heap_close(heap, AccessShareLock);
+
+    PG_RETURN_POINTER(result);
+}
+
+/*
+ * Parse the reloptions of a BRIN index.
+ *
+ * fmgr arguments: 0 = reloptions datum, 1 = whether to validate.  Returns
+ * NULL when no option is set, else a BrinOptions struct as a bytea.
+ */
+Datum
+brinoptions(PG_FUNCTION_ARGS)
+{
+    static const relopt_parse_elt layout[] = {
+        {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)}
+    };
+    Datum raw = PG_GETARG_DATUM(0);
+    bool validate = PG_GETARG_BOOL(1);
+    int nparsed;
+    relopt_value *parsed;
+    BrinOptions *result;
+
+    parsed = parseRelOptions(raw, validate, RELOPT_KIND_BRIN, &nparsed);
+
+    /* nothing was set, so there is no struct to build */
+    if (nparsed == 0)
+        PG_RETURN_NULL();
+
+    result = allocateReloptStruct(sizeof(BrinOptions), parsed, nparsed);
+    fillRelOptions((void *) result, sizeof(BrinOptions), parsed, nparsed,
+                   validate, layout, lengthof(layout));
+    pfree(parsed);
+
+    PG_RETURN_BYTEA_P(result);
+}
+
+/*
+ * SQL-callable function to scan through an index and summarize all ranges
+ * that are not currently summarized.
+ */ +Datum +brin_summarize_new_values(PG_FUNCTION_ARGS) +{ + Oid indexoid = PG_GETARG_OID(0); + Relation indexRel; + Relation heapRel; + double numSummarized = 0; + + heapRel = heap_open(IndexGetRelation(indexoid, false), + ShareUpdateExclusiveLock); + indexRel = index_open(indexoid, ShareUpdateExclusiveLock); + + brinsummarize(indexRel, heapRel, &numSummarized, NULL); + + relation_close(indexRel, ShareUpdateExclusiveLock); + relation_close(heapRel, ShareUpdateExclusiveLock); + + PG_RETURN_INT32((int32) numSummarized); +} + +/* + * Build a BrinDesc used to create or scan a BRIN index + */ +BrinDesc * +brin_build_desc(Relation rel) +{ + BrinOpcInfo **opcinfo; + BrinDesc *bdesc; + TupleDesc tupdesc; + int totalstored = 0; + int keyno; + long totalsize; + MemoryContext cxt; + MemoryContext oldcxt; + + cxt = AllocSetContextCreate(CurrentMemoryContext, + "brin desc cxt", + ALLOCSET_SMALL_INITSIZE, + ALLOCSET_SMALL_MINSIZE, + ALLOCSET_SMALL_MAXSIZE); + oldcxt = MemoryContextSwitchTo(cxt); + tupdesc = RelationGetDescr(rel); + + /* + * Obtain BrinOpcInfo for each indexed column. While at it, accumulate + * the number of columns stored, since the number is opclass-defined. 
+ */ + opcinfo = (BrinOpcInfo **) palloc(sizeof(BrinOpcInfo *) * tupdesc->natts); + for (keyno = 0; keyno < tupdesc->natts; keyno++) + { + FmgrInfo *opcInfoFn; + + opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO); + + opcinfo[keyno] = (BrinOpcInfo *) + DatumGetPointer(FunctionCall1(opcInfoFn, + tupdesc->attrs[keyno]->atttypid)); + totalstored += opcinfo[keyno]->oi_nstored; + } + + /* Allocate our result struct and fill it in */ + totalsize = offsetof(BrinDesc, bd_info) + + sizeof(BrinOpcInfo *) * tupdesc->natts; + + bdesc = palloc(totalsize); + bdesc->bd_context = cxt; + bdesc->bd_index = rel; + bdesc->bd_tupdesc = tupdesc; + bdesc->bd_disktdesc = NULL; /* generated lazily */ + bdesc->bd_totalstored = totalstored; + + for (keyno = 0; keyno < tupdesc->natts; keyno++) + bdesc->bd_info[keyno] = opcinfo[keyno]; + pfree(opcinfo); + + MemoryContextSwitchTo(oldcxt); + + return bdesc; +} + +void +brin_free_desc(BrinDesc *bdesc) +{ + /* make sure the tupdesc is still valid */ + Assert(bdesc->bd_tupdesc->tdrefcount >= 1); + /* no need for retail pfree */ + MemoryContextDelete(bdesc->bd_context); +} + +/* + * Initialize a BrinBuildState appropriate to create tuples on the given index. + */ +static BrinBuildState * +initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap, + BlockNumber pagesPerRange) +{ + BrinBuildState *state; + + state = palloc(sizeof(BrinBuildState)); + + state->bs_irel = idxRel; + state->bs_numtuples = 0; + state->bs_currentInsertBuf = InvalidBuffer; + state->bs_pagesPerRange = pagesPerRange; + state->bs_currRangeStart = 0; + state->bs_rmAccess = revmap; + state->bs_bdesc = brin_build_desc(idxRel); + state->bs_dtuple = brin_new_memtuple(state->bs_bdesc); + + brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc); + + return state; +} + +/* + * Release resources associated with a BrinBuildState. 
+ *
+ * Reports the free space of the last insertion buffer (if any) to the free
+ * space map and drops the pin on it, then frees the descriptor, the deformed
+ * tuple and the state struct itself.
+ */
+static void
+terminate_brin_buildstate(BrinBuildState *state)
+{
+    /* release the last index buffer used */
+    if (!BufferIsInvalid(state->bs_currentInsertBuf))
+    {
+        Page page;
+
+        page = BufferGetPage(state->bs_currentInsertBuf);
+        RecordPageWithFreeSpace(state->bs_irel,
+                                BufferGetBlockNumber(state->bs_currentInsertBuf),
+                                PageGetFreeSpace(page));
+        ReleaseBuffer(state->bs_currentInsertBuf);
+    }
+
+    brin_free_desc(state->bs_bdesc);
+    pfree(state->bs_dtuple);
+    pfree(state);
+}
+
+/*
+ * Summarize the given page range of the given index.
+ *
+ * This routine can run in parallel with insertions into the heap.  To avoid
+ * missing those values from the summary tuple, we first insert a placeholder
+ * index tuple into the index, then execute the heap scan; transactions
+ * concurrent with the scan update the placeholder tuple.  After the scan, we
+ * union the placeholder tuple with the one computed by this routine.  The
+ * update of the index value happens in a loop, so that if somebody updates
+ * the placeholder tuple after we read it, we detect the case and try again.
+ * This ensures that the concurrently inserted tuples are not lost.
+ *
+ * heapBlk is the first heap block of the range; state->bs_dtuple accumulates
+ * the summary and is not reset here.
+ */
+static void
+summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
+                BlockNumber heapBlk)
+{
+    Buffer phbuf;
+    BrinTuple *phtup;
+    Size phsz;
+    OffsetNumber offset;
+
+    /*
+     * Insert the placeholder tuple
+     */
+    phbuf = InvalidBuffer;
+    phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
+    offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
+                           state->bs_rmAccess, &phbuf,
+                           heapBlk, phtup, phsz);
+
+    /*
+     * Execute the partial heap scan covering the heap blocks in the specified
+     * page range, summarizing the heap tuples in it.  This scan stops just
+     * short of brinbuildCallback creating the new index entry.
+ */
+    state->bs_currRangeStart = heapBlk;
+    IndexBuildHeapRangeScan(heapRel, state->bs_irel, indexInfo, false,
+                            heapBlk, state->bs_pagesPerRange,
+                            brinbuildCallback, (void *) state);
+
+    /*
+     * Now we update the values obtained by the scan with the placeholder
+     * tuple.  We do this in a loop which only terminates if we're able to
+     * update the placeholder tuple successfully; if we are not, this means
+     * somebody else modified the placeholder tuple after we read it.
+     */
+    for (;;)
+    {
+        BrinTuple *newtup;
+        Size newsize;
+        bool didupdate;
+        bool samepage;
+
+        CHECK_FOR_INTERRUPTS();
+
+        /*
+         * Update the summary tuple and try to update.
+         */
+        newtup = brin_form_tuple(state->bs_bdesc,
+                                 heapBlk, state->bs_dtuple, &newsize);
+        samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
+        didupdate =
+            brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
+                          state->bs_rmAccess, heapBlk, phbuf, offset,
+                          phtup, phsz, newtup, newsize, samepage);
+        brin_free_tuple(phtup); /* both copies are ours, success or not */
+        brin_free_tuple(newtup);
+
+        /* If the update succeeded, we're done. */
+        if (didupdate)
+            break;
+
+        /*
+         * If the update didn't work, it might be because somebody updated the
+         * placeholder tuple concurrently.  Extract the new version, union it
+         * with the values we have from the scan, and start over.  (There are
+         * other reasons for the update to fail, but it's simple to treat them
+         * the same.)
+         */
+        phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
+                                         &offset, &phsz, BUFFER_LOCK_SHARE);
+        /* the placeholder tuple must exist */
+        if (phtup == NULL)
+            elog(ERROR, "missing placeholder tuple");
+        phtup = brin_copy_tuple(phtup, phsz); /* copy taken before the
+                                                * buffer lock is dropped */
+        LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
+
+        /* merge it into the tuple from the heap scan */
+        union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
+    }
+
+    ReleaseBuffer(phbuf);
+}
+
+/*
+ * Scan a complete BRIN index, and summarize each page range that's not already
+ * summarized.
The index and heap must have been locked by caller in at
+ * least ShareUpdateExclusiveLock mode.
+ *
+ * For each new index tuple inserted, *numSummarized (if not NULL) is
+ * incremented; for each existing tuple, numExisting (if not NULL) is
+ * incremented.
+ *
+ * The build state and IndexInfo are created lazily, on the first range found
+ * without a summary tuple, so an index with nothing to summarize never
+ * builds them.
+ */
+static void
+brinsummarize(Relation index, Relation heapRel, double *numSummarized,
+              double *numExisting)
+{
+    BrinRevmap *revmap;
+    BrinBuildState *state = NULL;
+    IndexInfo *indexInfo = NULL;
+    BlockNumber heapNumBlocks;
+    BlockNumber heapBlk;
+    BlockNumber pagesPerRange;
+    Buffer buf;
+
+    revmap = brinRevmapInitialize(index, &pagesPerRange);
+
+    /*
+     * Scan the revmap to find unsummarized items.
+     */
+    buf = InvalidBuffer;
+    heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
+    for (heapBlk = 0; heapBlk < heapNumBlocks; heapBlk += pagesPerRange)
+    {
+        BrinTuple *tup;
+        OffsetNumber off;
+
+        CHECK_FOR_INTERRUPTS();
+
+        tup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL,
+                                       BUFFER_LOCK_SHARE);
+        if (tup == NULL)
+        {
+            /* no revmap entry for this heap range. Summarize it. */
+            if (state == NULL)
+            {
+                /* first time through */
+                Assert(!indexInfo);
+                state = initialize_brin_buildstate(index, revmap,
+                                                   pagesPerRange);
+                indexInfo = BuildIndexInfo(index);
+
+                /*
+                 * We only have ShareUpdateExclusiveLock on the table, and
+                 * therefore other sessions may insert tuples into the range
+                 * we're going to scan.  This is okay, because we take
+                 * additional precautions to avoid losing the additional
+                 * tuples; see comments in summarize_range.  Set the
+                 * concurrent flag, which causes IndexBuildHeapRangeScan to
+                 * use a snapshot other than SnapshotAny, and silences
+                 * warnings emitted there.
+ */
+                indexInfo->ii_Concurrent = true;
+
+                /*
+                 * If using transaction-snapshot mode, it would be possible
+                 * for another transaction to insert a tuple that's not
+                 * visible to our snapshot if we have already acquired one,
+                 * when in snapshot-isolation mode; therefore, disallow this
+                 * from running in such a transaction unless a snapshot hasn't
+                 * been acquired yet.
+                 *
+                 * This code is called by VACUUM and
+                 * brin_summarize_new_values.  Have the error message mention
+                 * the latter because VACUUM cannot run in a transaction and
+                 * thus cannot cause this issue.
+                 */
+                if (IsolationUsesXactSnapshot() && FirstSnapshotSet)
+                    ereport(ERROR,
+                            (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                             errmsg("brin_summarize_new_values() cannot run in a transaction that has already obtained a snapshot")));
+            }
+            summarize_range(indexInfo, state, heapRel, heapBlk);
+
+            /* and re-initialize state for the next range */
+            brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
+
+            if (numSummarized)
+                *numSummarized += 1.0;
+        }
+        else
+        {
+            if (numExisting)
+                *numExisting += 1.0;
+            LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* unlocked only on this
+                                                   * path; the pin is dropped
+                                                   * after the loop */
+        }
+    }
+
+    if (BufferIsValid(buf))
+        ReleaseBuffer(buf);
+
+    /* free resources */
+    brinRevmapTerminate(revmap);
+    if (state)
+        terminate_brin_buildstate(state);
+}
+
+/*
+ * Given a deformed tuple in the build state, convert it into the on-disk
+ * format and insert it into the index, making the revmap point to it.
+ *
+ * The tuple is inserted for the range starting at bs_currRangeStart, and
+ * bs_numtuples is incremented.
+ */
+static void
+form_and_insert_tuple(BrinBuildState *state)
+{
+    BrinTuple *tup;
+    Size size;
+
+    tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
+                          state->bs_dtuple, &size);
+    brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
+                  &state->bs_currentInsertBuf, state->bs_currRangeStart,
+                  tup, size);
+    state->bs_numtuples++;
+
+    pfree(tup);
+}
+
+/*
+ * Given two deformed tuples, adjust the first one so that it's consistent
+ * with the summary values in both.
+ */ +static void +union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b) +{ + int keyno; + BrinMemTuple *db; + MemoryContext cxt; + MemoryContext oldcxt; + + /* Use our own memory context to avoid retail pfree */ + cxt = AllocSetContextCreate(CurrentMemoryContext, + "brin union", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcxt = MemoryContextSwitchTo(cxt); + db = brin_deform_tuple(bdesc, b); + MemoryContextSwitchTo(oldcxt); + + for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) + { + FmgrInfo *unionFn; + BrinValues *col_a = &a->bt_columns[keyno]; + BrinValues *col_b = &db->bt_columns[keyno]; + + unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1, + BRIN_PROCNUM_UNION); + FunctionCall3Coll(unionFn, + bdesc->bd_index->rd_indcollation[keyno], + PointerGetDatum(bdesc), + PointerGetDatum(col_a), + PointerGetDatum(col_b)); + } + + MemoryContextDelete(cxt); +} diff --git a/src/backend/access/brin/brin_minmax.c b/src/backend/access/brin/brin_minmax.c new file mode 100644 index 00000000000..3a2bee2649e --- /dev/null +++ b/src/backend/access/brin/brin_minmax.c @@ -0,0 +1,341 @@ +/* + * brin_minmax.c + * Implementation of Min/Max opclass for BRIN + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_minmax.c + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/brin_internal.h" +#include "access/brin_tuple.h" +#include "access/skey.h" +#include "catalog/pg_type.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" + + +/* + * Procedure numbers must not collide with BRIN_PROCNUM defines in + * brin_internal.h. Note we only need inequality functions. 
+ */ +#define MINMAX_NUM_PROCNUMS 4 /* # support procs we need */ +#define PROCNUM_LESS 11 +#define PROCNUM_LESSEQUAL 12 +#define PROCNUM_GREATEREQUAL 13 +#define PROCNUM_GREATER 14 + +/* + * Subtract this from procnum to obtain index in MinmaxOpaque arrays + * (Must be equal to minimum of private procnums) + */ +#define PROCNUM_BASE 11 + +static FmgrInfo *minmax_get_procinfo(BrinDesc *bdesc, uint16 attno, + uint16 procnum); + +PG_FUNCTION_INFO_V1(minmaxOpcInfo); +PG_FUNCTION_INFO_V1(minmaxAddValue); +PG_FUNCTION_INFO_V1(minmaxConsistent); +PG_FUNCTION_INFO_V1(minmaxUnion); + + +typedef struct MinmaxOpaque +{ + FmgrInfo operators[MINMAX_NUM_PROCNUMS]; + bool inited[MINMAX_NUM_PROCNUMS]; +} MinmaxOpaque; + +Datum +minmaxOpcInfo(PG_FUNCTION_ARGS) +{ + Oid typoid = PG_GETARG_OID(0); + BrinOpcInfo *result; + + /* + * opaque->operators is initialized lazily, as indicated by 'inited' which + * is initialized to all false by palloc0. + */ + + result = palloc0(MAXALIGN(SizeofBrinOpcInfo(2)) + + sizeof(MinmaxOpaque)); + result->oi_nstored = 2; + result->oi_opaque = (MinmaxOpaque *) + MAXALIGN((char *) result + SizeofBrinOpcInfo(2)); + result->oi_typids[0] = typoid; + result->oi_typids[1] = typoid; + + PG_RETURN_POINTER(result); +} + +/* + * Examine the given index tuple (which contains partial status of a certain + * page range) by comparing it to the given value that comes from another heap + * tuple. If the new value is outside the min/max range specified by the + * existing tuple values, update the index tuple and return true. Otherwise, + * return false and do not modify in this case. 
+ */
+Datum
+minmaxAddValue(PG_FUNCTION_ARGS)
+{
+    BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+    BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+    Datum newval = PG_GETARG_DATUM(2);
+    /* argument 3 is a boolean; fetch it as one rather than as a raw Datum */
+    bool isnull = PG_GETARG_BOOL(3);
+    Oid colloid = PG_GET_COLLATION();
+    FmgrInfo *cmpFn;
+    Datum compar;
+    bool updated = false;
+    Form_pg_attribute attr;
+    AttrNumber attno;
+
+    /*
+     * If the new value is null, we record that we saw it if it's the first
+     * one; otherwise, there's nothing to do.
+     */
+    if (isnull)
+    {
+        if (column->bv_hasnulls)
+            PG_RETURN_BOOL(false);
+
+        column->bv_hasnulls = true;
+        PG_RETURN_BOOL(true);
+    }
+
+    attno = column->bv_attno;
+    attr = bdesc->bd_tupdesc->attrs[attno - 1];
+
+    /*
+     * If the recorded value is null, store the new value (which we know to be
+     * not null) as both minimum and maximum, and we're done.
+     */
+    if (column->bv_allnulls)
+    {
+        column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen);
+        column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen);
+        column->bv_allnulls = false;
+        PG_RETURN_BOOL(true);
+    }
+
+    /*
+     * Otherwise, need to compare the new value with the existing boundaries
+     * and update them accordingly.  First check if it's less than the
+     * existing minimum.
+     */
+    cmpFn = minmax_get_procinfo(bdesc, attno, PROCNUM_LESS);
+    compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[0]);
+    if (DatumGetBool(compar))
+    {
+        /* by-reference values were datumCopy'd, so free the old copy */
+        if (!attr->attbyval)
+            pfree(DatumGetPointer(column->bv_values[0]));
+        column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen);
+        updated = true;
+    }
+
+    /*
+     * And now compare it to the existing maximum.
+     */
+    cmpFn = minmax_get_procinfo(bdesc, attno, PROCNUM_GREATER);
+    compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[1]);
+    if (DatumGetBool(compar))
+    {
+        if (!attr->attbyval)
+            pfree(DatumGetPointer(column->bv_values[1]));
+        column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen);
+        updated = true;
+    }
+
+    PG_RETURN_BOOL(updated);
+}
+
+/*
+ * Given an index tuple corresponding to a certain page range and a scan key,
+ * return whether the scan key is consistent with the index tuple's min/max
+ * values.  Return true if so, false otherwise.
+ *
+ * bv_values[0] holds the range minimum and bv_values[1] the maximum; each
+ * strategy compares the scan key against the bound that can rule the range
+ * out.
+ */
+Datum
+minmaxConsistent(PG_FUNCTION_ARGS)
+{
+    BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+    BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+    ScanKey key = (ScanKey) PG_GETARG_POINTER(2);
+    Oid colloid = PG_GET_COLLATION();
+    AttrNumber attno;
+    Datum value;
+    Datum matches;
+
+    Assert(key->sk_attno == column->bv_attno);
+
+    /* handle IS NULL/IS NOT NULL tests */
+    if (key->sk_flags & SK_ISNULL)
+    {
+        if (key->sk_flags & SK_SEARCHNULL)
+        {
+            if (column->bv_allnulls || column->bv_hasnulls)
+                PG_RETURN_BOOL(true);
+            PG_RETURN_BOOL(false);
+        }
+
+        /*
+         * For IS NOT NULL, we can only skip ranges that are known to have
+         * only nulls.
+         */
+        Assert(key->sk_flags & SK_SEARCHNOTNULL);
+        PG_RETURN_BOOL(!column->bv_allnulls);
+    }
+
+    /* if the range is all empty, it cannot possibly be consistent */
+    if (column->bv_allnulls)
+        PG_RETURN_BOOL(false);
+
+    attno = key->sk_attno;
+    value = key->sk_argument;
+    switch (key->sk_strategy)
+    {
+        case BTLessStrategyNumber:
+            matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+                                                            PROCNUM_LESS),
+                                        colloid, column->bv_values[0], value);
+            break;
+        case BTLessEqualStrategyNumber:
+            matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+                                                            PROCNUM_LESSEQUAL),
+                                        colloid, column->bv_values[0], value);
+            break;
+        case BTEqualStrategyNumber:
+
+            /*
+             * In the equality case (WHERE col = someval), we want to return
+             * the current page range if the minimum value in the range <=
+             * scan key, and the maximum value >= scan key.
+             */
+            matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+                                                            PROCNUM_LESSEQUAL),
+                                        colloid, column->bv_values[0], value);
+            if (!DatumGetBool(matches))
+                break;
+            /* max() >= scankey */
+            matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+                                                            PROCNUM_GREATEREQUAL),
+                                        colloid, column->bv_values[1], value);
+            break;
+        case BTGreaterEqualStrategyNumber:
+            matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+                                                            PROCNUM_GREATEREQUAL),
+                                        colloid, column->bv_values[1], value);
+            break;
+        case BTGreaterStrategyNumber:
+            matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+                                                            PROCNUM_GREATER),
+                                        colloid, column->bv_values[1], value);
+            break;
+        default:
+            /* shouldn't happen */
+            elog(ERROR, "invalid strategy number %d", key->sk_strategy);
+            matches = 0;
+            break;
+    }
+
+    PG_RETURN_DATUM(matches);
+}
+
+/*
+ * Given two BrinValues, update the first of them as a union of the summary
+ * values contained in both.  The second one is untouched.
+ *
+ * fmgr arguments: 0 = BrinDesc, 1 = column A (modified), 2 = column B.
+ */
+Datum
+minmaxUnion(PG_FUNCTION_ARGS)
+{
+    BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+    BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
+    BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
+    Oid colloid = PG_GET_COLLATION();
+    AttrNumber attno;
+    Form_pg_attribute attr;
+    bool needsadj;
+
+    Assert(col_a->bv_attno == col_b->bv_attno);
+
+    /*
+     * Adjust "hasnulls".  This has to happen before the all-nulls early exit
+     * below: minmaxAddValue sets bv_hasnulls on seeing a null without
+     * clearing bv_allnulls, so B can be all-nulls and still carry a
+     * "has nulls" flag that A must inherit.
+     */
+    if (col_b->bv_hasnulls && !col_a->bv_hasnulls)
+        col_a->bv_hasnulls = true;
+
+    /* If there are no values in B, there's nothing more to do */
+    if (col_b->bv_allnulls)
+        PG_RETURN_VOID();
+
+    attno = col_a->bv_attno;
+    attr = bdesc->bd_tupdesc->attrs[attno - 1];
+
+    /*
+     * Adjust "allnulls".  If B has values but A doesn't, just copy the values
+     * from B into A, and we're done.  (We cannot run the operators in this
+     * case, because values in A might contain garbage.)
+     */
+    if (!col_b->bv_allnulls && col_a->bv_allnulls)
+    {
+        col_a->bv_allnulls = false;
+        col_a->bv_values[0] = datumCopy(col_b->bv_values[0],
+                                        attr->attbyval, attr->attlen);
+        col_a->bv_values[1] = datumCopy(col_b->bv_values[1],
+                                        attr->attbyval, attr->attlen);
+        PG_RETURN_VOID();
+    }
+
+    /* Adjust minimum, if B's min is less than A's min */
+    needsadj = DatumGetBool(FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+                                                                  PROCNUM_LESS),
+                                              colloid, col_b->bv_values[0],
+                                              col_a->bv_values[0]));
+    if (needsadj)
+    {
+        /* by-reference values were datumCopy'd, so free the old copy */
+        if (!attr->attbyval)
+            pfree(DatumGetPointer(col_a->bv_values[0]));
+        col_a->bv_values[0] = datumCopy(col_b->bv_values[0],
+                                        attr->attbyval, attr->attlen);
+    }
+
+    /* Adjust maximum, if B's max is greater than A's max */
+    needsadj = DatumGetBool(FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
+                                                                  PROCNUM_GREATER),
+                                              colloid, col_b->bv_values[1],
+                                              col_a->bv_values[1]));
+    if (needsadj)
+    {
+        if (!attr->attbyval)
+            pfree(DatumGetPointer(col_a->bv_values[1]));
+        col_a->bv_values[1] = datumCopy(col_b->bv_values[1],
+                                        attr->attbyval, attr->attlen);
+    }
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * Return the procedure corresponding to the given function
support number. + */ +static FmgrInfo * +minmax_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum) +{ + MinmaxOpaque *opaque; + uint16 basenum = procnum - PROCNUM_BASE; + + opaque = (MinmaxOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; + + /* + * We cache these in the opaque struct, to avoid repetitive syscache + * lookups. + */ + if (!opaque->inited[basenum]) + { + fmgr_info_copy(&opaque->operators[basenum], + index_getprocinfo(bdesc->bd_index, attno, procnum), + bdesc->bd_context); + opaque->inited[basenum] = true; + } + + return &opaque->operators[basenum]; +} diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c new file mode 100644 index 00000000000..c34b86c94c7 --- /dev/null +++ b/src/backend/access/brin/brin_pageops.c @@ -0,0 +1,723 @@ +/* + * brin_pageops.c + * Page-handling routines for BRIN indexes + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_pageops.c + */ +#include "postgres.h" + +#include "access/brin_pageops.h" +#include "access/brin_page.h" +#include "access/brin_revmap.h" +#include "access/brin_xlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/freespace.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" +#include "utils/rel.h" + + +static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, + bool *was_extended); +static Size br_page_get_freespace(Page page); + + +/* + * Update tuple origtup (size origsz), located in offset oldoff of buffer + * oldbuf, to newtup (size newsz) as summary tuple for the page range starting + * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit. + * + * If samepage is true, attempt to put the new tuple in the same page, but if + * there's no room, use some other one. 
+ *
+ * If the update is successful, return true; the revmap is updated to point to
+ * the new tuple.  If the update is not done for whatever reason, return false.
+ * Caller may retry the update if this happens.
+ *
+ * The update is refused (false) when the item at oldoff is no longer a normal
+ * item, when its contents differ from origtup, when no new buffer could be
+ * obtained, or when a same-page update was requested but does not fit.
+ */
+bool
+brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
+              BrinRevmap *revmap, BlockNumber heapBlk,
+              Buffer oldbuf, OffsetNumber oldoff,
+              const BrinTuple *origtup, Size origsz,
+              const BrinTuple *newtup, Size newsz,
+              bool samepage)
+{
+    Page oldpage;
+    ItemId oldlp;
+    BrinTuple *oldtup;
+    Size oldsz;
+    Buffer newbuf;
+    BrinSpecialSpace *special;
+    bool extended = false;
+
+    newsz = MAXALIGN(newsz);
+
+    /* make sure the revmap is long enough to contain the entry we need */
+    brinRevmapExtend(revmap, heapBlk);
+
+    if (!samepage)
+    {
+        /* need a page on which to put the item */
+        newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
+        /* XXX delay vacuuming FSM until locks are released? */
+        if (extended)
+            FreeSpaceMapVacuum(idxrel);
+        if (!BufferIsValid(newbuf))
+            return false;
+
+        /*
+         * Note: it's possible (though unlikely) that the returned newbuf is
+         * the same as oldbuf, if brin_getinsertbuffer determined that the old
+         * buffer does in fact have enough space.
+         */
+        if (newbuf == oldbuf)
+            newbuf = InvalidBuffer;
+    }
+    else
+    {
+        LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+        newbuf = InvalidBuffer;
+    }
+    oldpage = BufferGetPage(oldbuf);
+    oldlp = PageGetItemId(oldpage, oldoff);
+
+    /*
+     * Check that the old tuple wasn't updated concurrently: it might have
+     * moved someplace else entirely ...
+     */
+    if (!ItemIdIsNormal(oldlp))
+    {
+        LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+        if (BufferIsValid(newbuf))
+            UnlockReleaseBuffer(newbuf);
+        return false;
+    }
+
+    oldsz = ItemIdGetLength(oldlp);
+    oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);
+
+    /*
+     * ... or it might have been updated in place to different contents.
+ */
+    if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
+    {
+        LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+        if (BufferIsValid(newbuf))
+            UnlockReleaseBuffer(newbuf);
+        return false;
+    }
+
+    special = (BrinSpecialSpace *) PageGetSpecialPointer(oldpage);
+
+    /*
+     * Great, the old tuple is intact.  We can proceed with the update.
+     *
+     * If there's enough room in the old page for the new tuple, replace it.
+     *
+     * Note that there might now be enough space on the page even though the
+     * caller told us there isn't, if a concurrent update moved another tuple
+     * elsewhere or replaced a tuple with a smaller one.
+     */
+    if (((special->flags & BRIN_EVACUATE_PAGE) == 0) &&
+        brin_can_do_samepage_update(oldbuf, origsz, newsz))
+    {
+        if (BufferIsValid(newbuf))
+            UnlockReleaseBuffer(newbuf);
+
+        START_CRIT_SECTION();
+        PageIndexDeleteNoCompact(oldpage, &oldoff, 1);
+        if (PageAddItem(oldpage, (Item) newtup, newsz, oldoff, true,
+                        false) == InvalidOffsetNumber)
+            elog(ERROR, "failed to add BRIN tuple"); /* NOTE(review): raised
+                                                       * inside the critical
+                                                       * section -- confirm
+                                                       * intended severity */
+        MarkBufferDirty(oldbuf);
+
+        /* XLOG stuff */
+        if (RelationNeedsWAL(idxrel))
+        {
+            BlockNumber blk = BufferGetBlockNumber(oldbuf);
+            xl_brin_samepage_update xlrec;
+            XLogRecPtr recptr;
+            XLogRecData rdata[2];
+            uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE;
+
+            xlrec.node = idxrel->rd_node;
+            ItemPointerSetBlockNumber(&xlrec.tid, blk);
+            ItemPointerSetOffsetNumber(&xlrec.tid, oldoff);
+            rdata[0].data = (char *) &xlrec;
+            rdata[0].len = SizeOfBrinSamepageUpdate;
+            rdata[0].buffer = InvalidBuffer;
+            rdata[0].next = &(rdata[1]);
+
+            rdata[1].data = (char *) newtup;
+            rdata[1].len = newsz;
+            rdata[1].buffer = oldbuf;
+            rdata[1].buffer_std = true;
+            rdata[1].next = NULL;
+
+            recptr = XLogInsert(RM_BRIN_ID, info, rdata);
+
+            PageSetLSN(oldpage, recptr);
+        }
+
+        END_CRIT_SECTION();
+
+        LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+        return true;
+    }
+    else if (newbuf == InvalidBuffer)
+    {
+        /*
+         * Not enough space, but caller said that there was.  Tell them to
+         * start over.
+ */
+        LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+        return false;
+    }
+    else
+    {
+        /*
+         * Not enough free space on the oldpage. Put the new tuple on the new
+         * page, and update the revmap.
+         */
+        Page newpage = BufferGetPage(newbuf);
+        Buffer revmapbuf;
+        ItemPointerData newtid;
+        OffsetNumber newoff;
+
+        revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+
+        START_CRIT_SECTION();
+
+        PageIndexDeleteNoCompact(oldpage, &oldoff, 1);
+        newoff = PageAddItem(newpage, (Item) newtup, newsz,
+                             InvalidOffsetNumber, false, false);
+        if (newoff == InvalidOffsetNumber)
+            elog(ERROR, "failed to add BRIN tuple to new page");
+        MarkBufferDirty(oldbuf);
+        MarkBufferDirty(newbuf);
+
+        ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff);
+        brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
+        MarkBufferDirty(revmapbuf);
+
+        /* XLOG stuff */
+        if (RelationNeedsWAL(idxrel))
+        {
+            xl_brin_update xlrec;
+            XLogRecPtr recptr;
+            XLogRecData rdata[4];
+            uint8 info;
+
+            info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);
+
+            xlrec.new.node = idxrel->rd_node;
+            ItemPointerSet(&xlrec.new.tid, BufferGetBlockNumber(newbuf), newoff);
+            xlrec.new.heapBlk = heapBlk;
+            xlrec.new.tuplen = newsz;
+            xlrec.new.revmapBlk = BufferGetBlockNumber(revmapbuf);
+            xlrec.new.pagesPerRange = pagesPerRange;
+            ItemPointerSet(&xlrec.oldtid, BufferGetBlockNumber(oldbuf), oldoff);
+
+            rdata[0].data = (char *) &xlrec;
+            rdata[0].len = SizeOfBrinUpdate;
+            rdata[0].buffer = InvalidBuffer;
+            rdata[0].next = &(rdata[1]);
+
+            rdata[1].data = (char *) newtup;
+            rdata[1].len = newsz;
+            rdata[1].buffer = extended ? InvalidBuffer : newbuf;
+            rdata[1].buffer_std = true;
+            rdata[1].next = &(rdata[2]);
+
+            rdata[2].data = (char *) NULL;
+            rdata[2].len = 0;
+            rdata[2].buffer = revmapbuf;
+            rdata[2].buffer_std = true;
+            rdata[2].next = &(rdata[3]);
+
+            rdata[3].data = (char *) NULL;
+            rdata[3].len = 0;
+            rdata[3].buffer = oldbuf;
+            rdata[3].buffer_std = true;
+            rdata[3].next = NULL;
+
+            recptr = XLogInsert(RM_BRIN_ID, info, rdata);
+
+            PageSetLSN(oldpage, recptr);
+            PageSetLSN(newpage, recptr);
+            PageSetLSN(BufferGetPage(revmapbuf), recptr);
+        }
+
+        END_CRIT_SECTION();
+
+        LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); /* unlocked only; this
+                                                     * function does not
+                                                     * release revmapbuf */
+        LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+        UnlockReleaseBuffer(newbuf);
+        return true;
+    }
+}
+
+/*
+ * Return whether brin_doupdate can do a samepage update.
+ *
+ * True when the new tuple is no larger than the original, or when the page's
+ * exact free space covers the growth (newsz - origsz).
+ */
+bool
+brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
+{
+    return
+        ((newsz <= origsz) ||
+         PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
+}
+
+/*
+ * Insert an index tuple into the index relation.  The revmap is updated to
+ * mark the range containing the given page as pointing to the inserted entry.
+ * A WAL record is written.
+ *
+ * The buffer, if valid, is first checked for free space to insert the new
+ * entry; if there isn't enough, a new buffer is obtained and pinned.  No
+ * buffer lock must be held on entry, no buffer lock is held on exit.
+ *
+ * Return value is the offset number where the tuple was inserted.
+ */
+OffsetNumber
+brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
+              BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
+              BrinTuple *tup, Size itemsz)
+{
+    Page page;
+    BlockNumber blk;
+    OffsetNumber off;
+    Buffer revmapbuf;
+    ItemPointerData tid;
+    bool extended = false;
+
+    itemsz = MAXALIGN(itemsz);
+
+    /* Make sure the revmap is long enough to contain the entry we need */
+    brinRevmapExtend(revmap, heapBlk);
+
+    /*
+     * Obtain a locked buffer to insert the new tuple.  Note
+     * brin_getinsertbuffer ensures there's enough space in the returned
+     * buffer.
+     */
+    if (BufferIsValid(*buffer))
+    {
+        /*
+         * It's possible that another backend (or ourselves!) extended the
+         * revmap over the page we held a pin on, so we cannot assume that
+         * it's still a regular page.
+         */
+        LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+        if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
+        {
+            UnlockReleaseBuffer(*buffer);
+            *buffer = InvalidBuffer;
+        }
+    }
+
+    if (!BufferIsValid(*buffer))
+    {
+        *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
+        Assert(BufferIsValid(*buffer));
+        Assert(br_page_get_freespace(BufferGetPage(*buffer)) >= itemsz);
+    }
+
+    /* Now obtain lock on revmap buffer */
+    revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+
+    page = BufferGetPage(*buffer);
+    blk = BufferGetBlockNumber(*buffer);
+
+    START_CRIT_SECTION();
+    off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
+                      false, false);
+    if (off == InvalidOffsetNumber)
+        elog(ERROR, "could not insert new index tuple to page");
+    MarkBufferDirty(*buffer);
+
+    BRIN_elog(DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
+              blk, off, heapBlk);
+
+    ItemPointerSet(&tid, blk, off);
+    brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
+    MarkBufferDirty(revmapbuf);
+
+    /* XLOG stuff */
+    if (RelationNeedsWAL(idxrel))
+    {
+        xl_brin_insert xlrec;
+        XLogRecPtr recptr;
+        XLogRecData rdata[3];
+        uint8 info;
+
+        info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
+        xlrec.node = idxrel->rd_node;
+        xlrec.heapBlk = heapBlk;
+        xlrec.pagesPerRange = pagesPerRange;
+        xlrec.revmapBlk = BufferGetBlockNumber(revmapbuf);
+        xlrec.tuplen = itemsz;
+        ItemPointerSet(&xlrec.tid, blk, off);
+
+        rdata[0].data = (char *) &xlrec;
+        rdata[0].len = SizeOfBrinInsert;
+        rdata[0].buffer = InvalidBuffer;
+        rdata[0].buffer_std = false;
+        rdata[0].next = &(rdata[1]);
+
+        rdata[1].data = (char *) tup;
+        rdata[1].len = itemsz;
+        rdata[1].buffer = extended ? InvalidBuffer : *buffer;
+        rdata[1].buffer_std = true;
+        rdata[1].next = &(rdata[2]);
+
+        rdata[2].data = (char *) NULL;
+        rdata[2].len = 0;
+        rdata[2].buffer = revmapbuf;
+        rdata[2].buffer_std = false;
+        rdata[2].next = NULL;
+
+        recptr = XLogInsert(RM_BRIN_ID, info, rdata);
+
+        PageSetLSN(page, recptr);
+        PageSetLSN(BufferGetPage(revmapbuf), recptr);
+    }
+
+    END_CRIT_SECTION();
+
+    /*
+     * Tuple is firmly on buffer; we can release our locks.  Both buffers are
+     * only unlocked here, not released.
+     */
+    LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
+    LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
+
+    if (extended)
+        FreeSpaceMapVacuum(idxrel);
+
+    return off;
+}
+
+/*
+ * Initialize a page with the given type.
+ *
+ * Caller is responsible for marking it dirty, as appropriate.
+ */
+void
+brin_page_init(Page page, uint16 type)
+{
+    BrinSpecialSpace *special;
+
+    PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));
+
+    special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
+    special->type = type;
+}
+
+/*
+ * Initialize a new BRIN index' metapage.
+ */
+void
+brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
+{
+    BrinMetaPageData *metadata;
+
+    brin_page_init(page, BRIN_PAGETYPE_META);
+
+    metadata = (BrinMetaPageData *) PageGetContents(page);
+
+    metadata->brinMagic = BRIN_META_MAGIC;
+    metadata->brinVersion = version;
+    metadata->pagesPerRange = pagesPerRange;
+
+    /*
+     * Note we cheat here a little.
0 is not a valid revmap block number + * (because it's the metapage buffer), but doing this enables the first + * revmap page to be created when the index is. + */ + metadata->lastRevmapPage = 0; +} + +/* + * Initiate page evacuation protocol. + * + * The page must be locked in exclusive mode by the caller. + * + * If the page is not yet initialized or empty, return false without doing + * anything; it can be used for revmap without any further changes. If it + * contains tuples, mark it for evacuation and return true. + */ +bool +brin_start_evacuating_page(Relation idxRel, Buffer buf) +{ + OffsetNumber off; + OffsetNumber maxoff; + BrinSpecialSpace *special; + Page page; + + page = BufferGetPage(buf); + + if (PageIsNew(page)) + return false; + + special = (BrinSpecialSpace *) PageGetSpecialPointer(page); + + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId lp; + + lp = PageGetItemId(page, off); + if (ItemIdIsUsed(lp)) + { + /* prevent other backends from adding more stuff to this page */ + special->flags |= BRIN_EVACUATE_PAGE; + MarkBufferDirtyHint(buf, true); + + return true; + } + } + return false; +} + +/* + * Move all tuples out of a page. + * + * The caller must hold lock on the page. The lock and pin are released. 
+ */ +void +brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange, + BrinRevmap *revmap, Buffer buf) +{ + OffsetNumber off; + OffsetNumber maxoff; + Page page; + + page = BufferGetPage(buf); + + Assert(((BrinSpecialSpace *) + PageGetSpecialPointer(page))->flags & BRIN_EVACUATE_PAGE); + + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + BrinTuple *tup; + Size sz; + ItemId lp; + + CHECK_FOR_INTERRUPTS(); + + lp = PageGetItemId(page, off); + if (ItemIdIsUsed(lp)) + { + sz = ItemIdGetLength(lp); + tup = (BrinTuple *) PageGetItem(page, lp); + tup = brin_copy_tuple(tup, sz); + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno, + buf, off, tup, sz, tup, sz, false)) + off--; /* retry */ + + LockBuffer(buf, BUFFER_LOCK_SHARE); + + /* It's possible that someone extended the revmap over this page */ + if (!BRIN_IS_REGULAR_PAGE(page)) + break; + } + } + + UnlockReleaseBuffer(buf); +} + +/* + * Return a pinned and exclusively locked buffer which can be used to insert an + * index item of size itemsz. If oldbuf is a valid buffer, it is also locked + * (in a order determined to avoid deadlocks.) + * + * If there's no existing page with enough free space to accomodate the new + * item, the relation is extended. If this happens, *extended is set to true. + * + * If we find that the old page is no longer a regular index page (because + * of a revmap extension), the old buffer is unlocked and we return + * InvalidBuffer. + */ +static Buffer +brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, + bool *was_extended) +{ + BlockNumber oldblk; + BlockNumber newblk; + Page page; + int freespace; + + if (BufferIsValid(oldbuf)) + oldblk = BufferGetBlockNumber(oldbuf); + else + oldblk = InvalidBlockNumber; + + /* + * Loop until we find a page with sufficient free space. 
By the time we + * return to caller out of this loop, both buffers are valid and locked; + * if we have to restart here, neither buffer is locked and buf is not a + * pinned buffer. + */ + newblk = RelationGetTargetBlock(irel); + if (newblk == InvalidBlockNumber) + newblk = GetPageWithFreeSpace(irel, itemsz); + for (;;) + { + Buffer buf; + bool extensionLockHeld = false; + bool extended = false; + + CHECK_FOR_INTERRUPTS(); + + if (newblk == InvalidBlockNumber) + { + /* + * There's not enough free space in any existing index page, + * according to the FSM: extend the relation to obtain a shiny new + * page. + */ + if (!RELATION_IS_LOCAL(irel)) + { + LockRelationForExtension(irel, ExclusiveLock); + extensionLockHeld = true; + } + buf = ReadBuffer(irel, P_NEW); + newblk = BufferGetBlockNumber(buf); + *was_extended = extended = true; + + BRIN_elog(DEBUG2, "brin_getinsertbuffer: extending to page %u", + BufferGetBlockNumber(buf)); + } + else if (newblk == oldblk) + { + /* + * There's an odd corner-case here where the FSM is out-of-date, + * and gave us the old page. + */ + buf = oldbuf; + } + else + { + buf = ReadBuffer(irel, newblk); + } + + /* + * We lock the old buffer first, if it's earlier than the new one; but + * before we do, we need to check that it hasn't been turned into a + * revmap page concurrently; if we detect that it happened, give up + * and tell caller to start over. + */ + if (BufferIsValid(oldbuf) && oldblk < newblk) + { + LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); + if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))) + { + LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); + return InvalidBuffer; + } + } + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + if (extensionLockHeld) + UnlockRelationForExtension(irel, ExclusiveLock); + + page = BufferGetPage(buf); + + if (extended) + brin_page_init(page, BRIN_PAGETYPE_REGULAR); + + /* + * We have a new buffer to insert into. 
Check that the new page has + * enough free space, and return it if it does; otherwise start over. + * Note that we allow for the FSM to be out of date here, and in that + * case we update it and move on. + * + * (br_page_get_freespace also checks that the FSM didn't hand us a + * page that has since been repurposed for the revmap.) + */ + freespace = br_page_get_freespace(page); + if (freespace >= itemsz) + { + RelationSetTargetBlock(irel, BufferGetBlockNumber(buf)); + + /* + * Since the target block specification can get lost on cache + * invalidations, make sure we update the more permanent FSM with + * data about it before going away. + */ + if (extended) + RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf), + freespace); + + /* + * Lock the old buffer if not locked already. Note that in this + * case we know for sure it's a regular page: it's later than the + * new page we just got, which is not a revmap page, and revmap + * pages are always consecutive. + */ + if (BufferIsValid(oldbuf) && oldblk > newblk) + { + LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); + Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))); + } + + return buf; + } + + /* This page is no good. */ + + /* + * If an entirely new page does not contain enough free space for the + * new item, then surely that item is oversized. Complain loudly; but + * first make sure we record the page as free, for next time. 
+ */ + if (extended) + { + RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf), + freespace); + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", + (unsigned long) itemsz, + (unsigned long) freespace, + RelationGetRelationName(irel)))); + return InvalidBuffer; /* keep compiler quiet */ + } + + if (newblk != oldblk) + UnlockReleaseBuffer(buf); + if (BufferIsValid(oldbuf) && oldblk <= newblk) + LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); + + newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz); + } +} + +/* + * Return the amount of free space on a regular BRIN index page. + * + * If the page is not a regular page, or has been marked with the + * BRIN_EVACUATE_PAGE flag, returns 0. + */ +static Size +br_page_get_freespace(Page page) +{ + BrinSpecialSpace *special; + + special = (BrinSpecialSpace *) PageGetSpecialPointer(page); + if (!BRIN_IS_REGULAR_PAGE(page) || + (special->flags & BRIN_EVACUATE_PAGE) != 0) + return 0; + else + return PageGetFreeSpace(page); +} diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c new file mode 100644 index 00000000000..b08a94b742d --- /dev/null +++ b/src/backend/access/brin/brin_revmap.c @@ -0,0 +1,510 @@ +/* + * brin_revmap.c + * Range map for BRIN indexes + * + * The range map (revmap) is a translation structure for BRIN indexes: for each + * page range there is one summary tuple, and its location is tracked by the + * revmap. Whenever a new tuple is inserted into a table that violates the + * previously recorded summary values, a new tuple is inserted into the index + * and the revmap is updated to point to it. + * + * The revmap is stored in the first pages of the index, immediately following + * the metapage. When the revmap needs to be expanded, all tuples on the + * regular BRIN page at that block (if any) are moved out of the way. 
+ * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_revmap.c + */ +#include "postgres.h" + +#include "access/brin_page.h" +#include "access/brin_pageops.h" +#include "access/brin_revmap.h" +#include "access/brin_tuple.h" +#include "access/brin_xlog.h" +#include "access/rmgr.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "utils/rel.h" + + +/* + * In revmap pages, each item stores an ItemPointerData. These defines let one + * find the logical revmap page number and index number of the revmap item for + * the given heap block number. + */ +#define HEAPBLK_TO_REVMAP_BLK(pagesPerRange, heapBlk) \ + ((heapBlk / pagesPerRange) / REVMAP_PAGE_MAXITEMS) +#define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \ + ((heapBlk / pagesPerRange) % REVMAP_PAGE_MAXITEMS) + + +struct BrinRevmap +{ + Relation rm_irel; + BlockNumber rm_pagesPerRange; + BlockNumber rm_lastRevmapPage; /* cached from the metapage */ + Buffer rm_metaBuf; + Buffer rm_currBuf; +}; + +/* typedef appears in brin_revmap.h */ + + +static BlockNumber revmap_get_blkno(BrinRevmap *revmap, + BlockNumber heapBlk); +static Buffer revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk); +static BlockNumber revmap_extend_and_get_blkno(BrinRevmap *revmap, + BlockNumber heapBlk); +static void revmap_physical_extend(BrinRevmap *revmap); + +/* + * Initialize an access object for a range map. This must be freed by + * brinRevmapTerminate when caller is done with it. 
+ */ +BrinRevmap * +brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange) +{ + BrinRevmap *revmap; + Buffer meta; + BrinMetaPageData *metadata; + + meta = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO); + LockBuffer(meta, BUFFER_LOCK_SHARE); + metadata = (BrinMetaPageData *) PageGetContents(BufferGetPage(meta)); + + revmap = palloc(sizeof(BrinRevmap)); + revmap->rm_irel = idxrel; + revmap->rm_pagesPerRange = metadata->pagesPerRange; + revmap->rm_lastRevmapPage = metadata->lastRevmapPage; + revmap->rm_metaBuf = meta; + revmap->rm_currBuf = InvalidBuffer; + + *pagesPerRange = metadata->pagesPerRange; + + LockBuffer(meta, BUFFER_LOCK_UNLOCK); + + return revmap; +} + +/* + * Release resources associated with a revmap access object. + */ +void +brinRevmapTerminate(BrinRevmap *revmap) +{ + ReleaseBuffer(revmap->rm_metaBuf); + if (revmap->rm_currBuf != InvalidBuffer) + ReleaseBuffer(revmap->rm_currBuf); + pfree(revmap); +} + +/* + * Extend the revmap to cover the given heap block number. + */ +void +brinRevmapExtend(BrinRevmap *revmap, BlockNumber heapBlk) +{ + BlockNumber mapBlk; + + mapBlk = revmap_extend_and_get_blkno(revmap, heapBlk); + + /* Ensure the buffer we got is in the expected range */ + Assert(mapBlk != InvalidBlockNumber && + mapBlk != BRIN_METAPAGE_BLKNO && + mapBlk <= revmap->rm_lastRevmapPage); +} + +/* + * Prepare to insert an entry into the revmap; the revmap buffer in which the + * entry is to reside is locked and returned. Most callers should call + * brinRevmapExtend beforehand, as this routine does not extend the revmap if + * it's not long enough. + * + * The returned buffer is also recorded in the revmap struct; finishing that + * releases the buffer, therefore the caller needn't do it explicitely. 
+ */
+Buffer
+brinLockRevmapPageForUpdate(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+    Buffer      rmBuf;
+
+    /* revmap_get_buffer raises an error if the revmap doesn't cover heapBlk */
+    rmBuf = revmap_get_buffer(revmap, heapBlk);
+    LockBuffer(rmBuf, BUFFER_LOCK_EXCLUSIVE);
+
+    return rmBuf;
+}
+
+/*
+ * In the given revmap buffer (locked appropriately by caller), which is used
+ * in a BRIN index of pagesPerRange pages per range, set the element
+ * corresponding to heap block number heapBlk to the given TID.
+ *
+ * This routine neither marks the buffer dirty nor sets its LSN; once the
+ * operation is complete, the caller must update the LSN on the page of the
+ * buffer it passed in.
+ *
+ * This is used both in regular operation and during WAL replay.
+ */
+void
+brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
+                        BlockNumber heapBlk, ItemPointerData tid)
+{
+    RevmapContents *contents;
+    ItemPointerData *iptr;
+    Page        page;
+
+    /* The correct page should already be pinned and locked */
+    page = BufferGetPage(buf);
+    contents = (RevmapContents *) PageGetContents(page);
+    iptr = (ItemPointerData *) contents->rm_tids;
+    /* advance to the slot for the range containing heapBlk */
+    iptr += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk);
+
+    ItemPointerSet(iptr,
+                   ItemPointerGetBlockNumber(&tid),
+                   ItemPointerGetOffsetNumber(&tid));
+}
+
+/*
+ * Fetch the BrinTuple for a given heap block.
+ *
+ * The buffer containing the tuple is locked, and returned in *buf. As an
+ * optimization, the caller can pass a pinned buffer *buf on entry, which will
+ * avoid a pin-unpin cycle when the next tuple is on the same page as a
+ * previous one.
+ *
+ * If no tuple is found for the given heap range, returns NULL. In that case,
+ * *buf might still be updated, but it's not locked.
+ *
+ * The output tuple offset within the buffer is returned in *off, and its size
+ * is returned in *size.
+ */
+BrinTuple *
+brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
+                         Buffer *buf, OffsetNumber *off, Size *size, int mode)
+{
+    Relation    idxRel = revmap->rm_irel;
+    BlockNumber mapBlk;
+    RevmapContents *contents;
+    ItemPointerData *iptr;
+    BlockNumber blk;
+    Page        page;
+    ItemId      lp;
+    BrinTuple  *tup;
+    ItemPointerData previptr;
+
+    /* normalize the heap block number to be the first page in the range */
+    heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange;
+
+    /* Compute the revmap page number we need */
+    mapBlk = revmap_get_blkno(revmap, heapBlk);
+    if (mapBlk == InvalidBlockNumber)
+    {
+        /* the revmap doesn't reach this range yet, so there is no tuple */
+        *off = InvalidOffsetNumber;
+        return NULL;
+    }
+
+    ItemPointerSetInvalid(&previptr);
+    for (;;)
+    {
+        CHECK_FOR_INTERRUPTS();
+
+        /* Switch the cached revmap buffer to the page we need, if necessary */
+        if (revmap->rm_currBuf == InvalidBuffer ||
+            BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk)
+        {
+            if (revmap->rm_currBuf != InvalidBuffer)
+                ReleaseBuffer(revmap->rm_currBuf);
+
+            Assert(mapBlk != InvalidBlockNumber);
+            revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
+        }
+
+        LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE);
+
+        contents = (RevmapContents *)
+            PageGetContents(BufferGetPage(revmap->rm_currBuf));
+        iptr = contents->rm_tids;
+        iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk);
+
+        /* An invalid TID in the revmap slot means no tuple for this range */
+        if (!ItemPointerIsValid(iptr))
+        {
+            LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
+            return NULL;
+        }
+
+        /*
+         * Check the TID we got in a previous iteration, if any, and save the
+         * current TID we got from the revmap; if we loop, we can sanity-check
+         * that the next one we get is different. Otherwise we might be stuck
+         * looping forever if the revmap is somehow badly broken.
+         */
+        if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr))
+            ereport(ERROR,
+                    (errcode(ERRCODE_INDEX_CORRUPTED),
+                     errmsg_internal("corrupted BRIN index: inconsistent range map")));
+        previptr = *iptr;
+
+        blk = ItemPointerGetBlockNumber(iptr);
+        *off = ItemPointerGetOffsetNumber(iptr);
+
+        LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
+
+        /* Ok, got a pointer to where the BrinTuple should be. Fetch it. */
+        if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk)
+        {
+            if (BufferIsValid(*buf))
+                ReleaseBuffer(*buf);
+            *buf = ReadBuffer(idxRel, blk);
+        }
+        /* lock the target page in the caller-requested mode */
+        LockBuffer(*buf, mode);
+        page = BufferGetPage(*buf);
+
+        /*
+         * Only look for the tuple if this is a regular page; if we landed on
+         * a revmap page instead, fall through and start over.
+         */
+        if (BRIN_IS_REGULAR_PAGE(page))
+        {
+            lp = PageGetItemId(page, *off);
+            if (ItemIdIsUsed(lp))
+            {
+                tup = (BrinTuple *) PageGetItem(page, lp);
+
+                if (tup->bt_blkno == heapBlk)
+                {
+                    /* size is optional: callers may pass NULL */
+                    if (size)
+                        *size = ItemIdGetLength(lp);
+                    /* found it! */
+                    return tup;
+                }
+            }
+        }
+
+        /*
+         * No luck. Assume that the revmap was updated concurrently.
+         */
+        LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
+    }
+    /* not reached, but keep compiler quiet */
+    return NULL;
+}
+
+/*
+ * Given a heap block number, find the corresponding physical revmap block
+ * number and return it. If the revmap page hasn't been allocated yet, return
+ * InvalidBlockNumber.
+ *
+ * The check uses the cached rm_lastRevmapPage value; the metapage is not
+ * re-read here.
+ */
+static BlockNumber
+revmap_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+    BlockNumber targetblk;
+
+    /* obtain revmap block number, skip 1 for metapage block */
+    targetblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1;
+
+    /* Normal case: the revmap page is already allocated */
+    if (targetblk <= revmap->rm_lastRevmapPage)
+        return targetblk;
+
+    return InvalidBlockNumber;
+}
+
+/*
+ * Obtain and return a buffer containing the revmap page for the given heap
+ * page. The revmap must have been previously extended to cover that page.
+ * The returned buffer is also recorded in the revmap struct; finishing that
+ * releases the buffer, therefore the caller needn't do it explicitly.
+ *
+ * The buffer is returned pinned but not locked.
+ */
+static Buffer
+revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+    BlockNumber mapBlk;
+
+    /* Translate the heap block number to physical index location. */
+    mapBlk = revmap_get_blkno(revmap, heapBlk);
+
+    if (mapBlk == InvalidBlockNumber)
+        elog(ERROR, "revmap does not cover heap block %u", heapBlk);
+
+    /* Ensure the buffer we got is in the expected range */
+    Assert(mapBlk != BRIN_METAPAGE_BLKNO &&
+           mapBlk <= revmap->rm_lastRevmapPage);
+
+    BRIN_elog(DEBUG2, "getting revmap page for logical page %lu (physical %u) for heap %u",
+              HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk),
+              mapBlk, heapBlk);
+
+    /*
+     * Obtain the buffer from which we need to read. If we already have the
+     * correct buffer in our access struct, use that; otherwise, release that
+     * (if valid) and read the one we need.
+     */
+    if (revmap->rm_currBuf == InvalidBuffer ||
+        mapBlk != BufferGetBlockNumber(revmap->rm_currBuf))
+    {
+        if (revmap->rm_currBuf != InvalidBuffer)
+            ReleaseBuffer(revmap->rm_currBuf);
+
+        revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
+    }
+
+    return revmap->rm_currBuf;
+}
+
+/*
+ * Given a heap block number, find the corresponding physical revmap block
+ * number and return it. If the revmap page hasn't been allocated yet, extend
+ * the revmap until it is.
+ */
+static BlockNumber
+revmap_extend_and_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+    BlockNumber targetblk;
+
+    /* obtain revmap block number, skip 1 for metapage block */
+    targetblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1;
+
+    /*
+     * Extend the revmap, if necessary. A single call may return without
+     * adding a page (it then refreshes rm_lastRevmapPage or evacuates the
+     * target page), so keep calling until the target block is covered.
+     */
+    while (targetblk > revmap->rm_lastRevmapPage)
+    {
+        CHECK_FOR_INTERRUPTS();
+        revmap_physical_extend(revmap);
+    }
+
+    return targetblk;
+}
+
+/*
+ * Try to extend the revmap by one page.
This might not happen for a number of + * reasons; caller is expected to retry until the expected outcome is obtained. + */ +static void +revmap_physical_extend(BrinRevmap *revmap) +{ + Buffer buf; + Page page; + Page metapage; + BrinMetaPageData *metadata; + BlockNumber mapBlk; + BlockNumber nblocks; + Relation irel = revmap->rm_irel; + bool needLock = !RELATION_IS_LOCAL(irel); + + /* + * Lock the metapage. This locks out concurrent extensions of the revmap, + * but note that we still need to grab the relation extension lock because + * another backend can extend the index with regular BRIN pages. + */ + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE); + metapage = BufferGetPage(revmap->rm_metaBuf); + metadata = (BrinMetaPageData *) PageGetContents(metapage); + + /* + * Check that our cached lastRevmapPage value was up-to-date; if it + * wasn't, update the cached copy and have caller start over. + */ + if (metadata->lastRevmapPage != revmap->rm_lastRevmapPage) + { + revmap->rm_lastRevmapPage = metadata->lastRevmapPage; + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); + return; + } + mapBlk = metadata->lastRevmapPage + 1; + + nblocks = RelationGetNumberOfBlocks(irel); + if (mapBlk < nblocks) + { + buf = ReadBuffer(irel, mapBlk); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + } + else + { + if (needLock) + LockRelationForExtension(irel, ExclusiveLock); + + buf = ReadBuffer(irel, P_NEW); + if (BufferGetBlockNumber(buf) != mapBlk) + { + /* + * Very rare corner case: somebody extended the relation + * concurrently after we read its length. If this happens, give + * up and have caller start over. We will have to evacuate that + * page from under whoever is using it. 
+ */ + if (needLock) + UnlockRelationForExtension(irel, ExclusiveLock); + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); + return; + } + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + + if (needLock) + UnlockRelationForExtension(irel, ExclusiveLock); + } + + /* Check that it's a regular block (or an empty page) */ + if (!PageIsNew(page) && !BRIN_IS_REGULAR_PAGE(page)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("unexpected page type 0x%04X in BRIN index \"%s\" block %u", + BRIN_PAGE_TYPE(page), + RelationGetRelationName(irel), + BufferGetBlockNumber(buf)))); + + /* If the page is in use, evacuate it and restart */ + if (brin_start_evacuating_page(irel, buf)) + { + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); + brin_evacuate_page(irel, revmap->rm_pagesPerRange, revmap, buf); + + /* have caller start over */ + return; + } + + /* + * Ok, we have now locked the metapage and the target block. Re-initialize + * it as a revmap page. + */ + START_CRIT_SECTION(); + + /* the rm_tids array is initialized to all invalid by PageInit */ + brin_page_init(page, BRIN_PAGETYPE_REVMAP); + MarkBufferDirty(buf); + + metadata->lastRevmapPage = mapBlk; + MarkBufferDirty(revmap->rm_metaBuf); + + if (RelationNeedsWAL(revmap->rm_irel)) + { + xl_brin_revmap_extend xlrec; + XLogRecPtr recptr; + XLogRecData rdata[2]; + + xlrec.node = revmap->rm_irel->rd_node; + xlrec.targetBlk = mapBlk; + rdata[0].data = (char *) &xlrec; + rdata[0].len = SizeOfBrinRevmapExtend; + rdata[0].buffer = InvalidBuffer; + rdata[0].buffer_std = false; + rdata[0].next = &(rdata[1]); + + rdata[1].data = (char *) NULL; + rdata[1].len = 0; + rdata[1].buffer = revmap->rm_metaBuf; + rdata[1].buffer_std = false; + rdata[1].next = NULL; + + recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND, rdata); + PageSetLSN(metapage, recptr); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); + + 
UnlockReleaseBuffer(buf); +} diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c new file mode 100644 index 00000000000..d895cb715cb --- /dev/null +++ b/src/backend/access/brin/brin_tuple.c @@ -0,0 +1,554 @@ +/* + * brin_tuples.c + * Method implementations for tuples in BRIN indexes. + * + * Intended usage is that code outside this file only deals with + * BrinMemTuples, and convert to and from the on-disk representation through + * functions in this file. + * + * NOTES + * + * A BRIN tuple is similar to a heap tuple, with a few key differences. The + * first interesting difference is that the tuple header is much simpler, only + * containing its total length and a small area for flags. Also, the stored + * data does not match the relation tuple descriptor exactly: for each + * attribute in the descriptor, the index tuple carries an arbitrary number + * of values, depending on the opclass. + * + * Also, for each column of the index relation there are two null bits: one + * (hasnulls) stores whether any tuple within the page range has that column + * set to null; the other one (allnulls) stores whether the column values are + * all null. If allnulls is true, then the tuple data area does not contain + * values for that column at all; whereas it does if the hasnulls is set. + * Note the size of the null bitmask may not be the same as that of the + * datum array. 
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *      src/backend/access/brin/brin_tuple.c
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/brin_tuple.h"
+#include "access/tupdesc.h"
+#include "access/tupmacs.h"
+#include "utils/datum.h"
+#include "utils/memutils.h"
+
+
+static inline void brin_deconstruct_tuple(BrinDesc *brdesc,
+                       char *tp, bits8 *nullbits, bool nulls,
+                       Datum *values, bool *allnulls, bool *hasnulls);
+
+
+/*
+ * Return a tuple descriptor used for on-disk storage of BRIN tuples.
+ *
+ * The descriptor has one entry per stored datum (bd_totalstored in all),
+ * i.e. oi_nstored entries for each indexed column, in column order. It is
+ * built on first use and cached in brdesc->bd_disktdesc.
+ */
+static TupleDesc
+brtuple_disk_tupdesc(BrinDesc *brdesc)
+{
+    /* We cache these in the BrinDesc */
+    if (brdesc->bd_disktdesc == NULL)
+    {
+        int         i;
+        int         j;
+        AttrNumber  attno = 1;
+        TupleDesc   tupdesc;
+        MemoryContext oldcxt;
+
+        /* make sure it's in the bdesc's context */
+        oldcxt = MemoryContextSwitchTo(brdesc->bd_context);
+
+        tupdesc = CreateTemplateTupleDesc(brdesc->bd_totalstored, false);
+
+        /* attno runs continuously across all columns' stored datums */
+        for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
+        {
+            for (j = 0; j < brdesc->bd_info[i]->oi_nstored; j++)
+                TupleDescInitEntry(tupdesc, attno++, NULL,
+                                   brdesc->bd_info[i]->oi_typids[j],
+                                   -1, 0);
+        }
+
+        MemoryContextSwitchTo(oldcxt);
+
+        brdesc->bd_disktdesc = tupdesc;
+    }
+
+    return brdesc->bd_disktdesc;
+}
+
+/*
+ * Generate a new on-disk tuple to be inserted in a BRIN index.
+ *
+ * See brin_form_placeholder_tuple if you touch this.
+ */ +BrinTuple * +brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, + Size *size) +{ + Datum *values; + bool *nulls; + bool anynulls = false; + BrinTuple *rettuple; + int keyno; + int idxattno; + uint16 phony_infomask; + bits8 *phony_nullbitmap; + Size len, + hoff, + data_len; + + Assert(brdesc->bd_totalstored > 0); + + values = palloc(sizeof(Datum) * brdesc->bd_totalstored); + nulls = palloc0(sizeof(bool) * brdesc->bd_totalstored); + phony_nullbitmap = palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored)); + + /* + * Set up the values/nulls arrays for heap_fill_tuple + */ + idxattno = 0; + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + int datumno; + + /* + * "allnulls" is set when there's no nonnull value in any row in the + * column; when this happens, there is no data to store. Thus set the + * nullable bits for all data elements of this column and we're done. + */ + if (tuple->bt_columns[keyno].bv_allnulls) + { + for (datumno = 0; + datumno < brdesc->bd_info[keyno]->oi_nstored; + datumno++) + nulls[idxattno++] = true; + anynulls = true; + continue; + } + + /* + * The "hasnulls" bit is set when there are some null values in the + * data. We still need to store a real value, but the presence of + * this means we need a null bitmap. + */ + if (tuple->bt_columns[keyno].bv_hasnulls) + anynulls = true; + + for (datumno = 0; + datumno < brdesc->bd_info[keyno]->oi_nstored; + datumno++) + values[idxattno++] = tuple->bt_columns[keyno].bv_values[datumno]; + } + + /* compute total space needed */ + len = SizeOfBrinTuple; + if (anynulls) + { + /* + * We need a double-length bitmap on an on-disk BRIN index tuple; the + * first half stores the "allnulls" bits, the second stores + * "hasnulls". 
+ */ + len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2); + } + + len = hoff = MAXALIGN(len); + + data_len = heap_compute_data_size(brtuple_disk_tupdesc(brdesc), + values, nulls); + + len += data_len; + + rettuple = palloc0(len); + rettuple->bt_blkno = blkno; + rettuple->bt_info = hoff; + Assert((rettuple->bt_info & BRIN_OFFSET_MASK) == hoff); + + /* + * The infomask and null bitmap as computed by heap_fill_tuple are useless + * to us. However, that function will not accept a null infomask; and we + * need to pass a valid null bitmap so that it will correctly skip + * outputting null attributes in the data area. + */ + heap_fill_tuple(brtuple_disk_tupdesc(brdesc), + values, + nulls, + (char *) rettuple + hoff, + data_len, + &phony_infomask, + phony_nullbitmap); + + /* done with these */ + pfree(values); + pfree(nulls); + pfree(phony_nullbitmap); + + /* + * Now fill in the real null bitmasks. allnulls first. + */ + if (anynulls) + { + bits8 *bitP; + int bitmask; + + rettuple->bt_info |= BRIN_NULLS_MASK; + + /* + * Note that we reverse the sense of null bits in this module: we + * store a 1 for a null attribute rather than a 0. So we must reverse + * the sense of the att_isnull test in br_deconstruct_tuple as well. 
+ */ + bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1; + bitmask = HIGHBIT; + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + if (bitmask != HIGHBIT) + bitmask <<= 1; + else + { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + if (!tuple->bt_columns[keyno].bv_allnulls) + continue; + + *bitP |= bitmask; + } + /* hasnulls bits follow */ + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + if (bitmask != HIGHBIT) + bitmask <<= 1; + else + { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + if (!tuple->bt_columns[keyno].bv_hasnulls) + continue; + + *bitP |= bitmask; + } + bitP = ((bits8 *) (rettuple + SizeOfBrinTuple)) - 1; + } + + if (tuple->bt_placeholder) + rettuple->bt_info |= BRIN_PLACEHOLDER_MASK; + + *size = len; + return rettuple; +} + +/* + * Generate a new on-disk tuple with no data values, marked as placeholder. + * + * This is a cut-down version of brin_form_tuple. + */ +BrinTuple * +brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size) +{ + Size len; + Size hoff; + BrinTuple *rettuple; + int keyno; + bits8 *bitP; + int bitmask; + + /* compute total space needed: always add nulls */ + len = SizeOfBrinTuple; + len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2); + len = hoff = MAXALIGN(len); + + rettuple = palloc0(len); + rettuple->bt_blkno = blkno; + rettuple->bt_info = hoff; + rettuple->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK; + + bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1; + bitmask = HIGHBIT; + /* set allnulls true for all attributes */ + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + if (bitmask != HIGHBIT) + bitmask <<= 1; + else + { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + *bitP |= bitmask; + } + /* no need to set hasnulls */ + + *size = len; + return rettuple; +} + +/* + * Free a tuple created by brin_form_tuple + */ +void +brin_free_tuple(BrinTuple *tuple) +{ + pfree(tuple); +} + +/* + * Create an palloc'd copy of a 
BrinTuple. + */ +BrinTuple * +brin_copy_tuple(BrinTuple *tuple, Size len) +{ + BrinTuple *newtup; + + newtup = palloc(len); + memcpy(newtup, tuple, len); + + return newtup; +} + +/* + * Return whether two BrinTuples are bitwise identical. + */ +bool +brin_tuples_equal(const BrinTuple *a, Size alen, const BrinTuple *b, Size blen) +{ + if (alen != blen) + return false; + if (memcmp(a, b, alen) != 0) + return false; + return true; +} + +/* + * Create a new BrinMemTuple from scratch, and initialize it to an empty + * state. + * + * Note: we don't provide any means to free a deformed tuple, so make sure to + * use a temporary memory context. + */ +BrinMemTuple * +brin_new_memtuple(BrinDesc *brdesc) +{ + BrinMemTuple *dtup; + char *currdatum; + long basesize; + int i; + + basesize = MAXALIGN(sizeof(BrinMemTuple) + + sizeof(BrinValues) * brdesc->bd_tupdesc->natts); + dtup = palloc0(basesize + sizeof(Datum) * brdesc->bd_totalstored); + currdatum = (char *) dtup + basesize; + for (i = 0; i < brdesc->bd_tupdesc->natts; i++) + { + dtup->bt_columns[i].bv_attno = i + 1; + dtup->bt_columns[i].bv_allnulls = true; + dtup->bt_columns[i].bv_hasnulls = false; + dtup->bt_columns[i].bv_values = (Datum *) currdatum; + currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored; + } + + dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext, + "brin dtuple", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + return dtup; +} + +/* + * Reset a BrinMemTuple to initial state + */ +void +brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc) +{ + int i; + + MemoryContextReset(dtuple->bt_context); + for (i = 0; i < brdesc->bd_tupdesc->natts; i++) + { + dtuple->bt_columns[i].bv_allnulls = true; + dtuple->bt_columns[i].bv_hasnulls = false; + } +} + +/* + * Convert a BrinTuple back to a BrinMemTuple. This is the reverse of + * brin_form_tuple. 
+ * + * Note we don't need the "on disk tupdesc" here; we rely on our own routine to + * deconstruct the tuple from the on-disk format. + */ +BrinMemTuple * +brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple) +{ + BrinMemTuple *dtup; + Datum *values; + bool *allnulls; + bool *hasnulls; + char *tp; + bits8 *nullbits; + int keyno; + int valueno; + MemoryContext oldcxt; + + dtup = brin_new_memtuple(brdesc); + + if (BrinTupleIsPlaceholder(tuple)) + dtup->bt_placeholder = true; + dtup->bt_blkno = tuple->bt_blkno; + + values = palloc(sizeof(Datum) * brdesc->bd_totalstored); + allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); + hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); + + tp = (char *) tuple + BrinTupleDataOffset(tuple); + + if (BrinTupleHasNulls(tuple)) + nullbits = (bits8 *) ((char *) tuple + SizeOfBrinTuple); + else + nullbits = NULL; + brin_deconstruct_tuple(brdesc, + tp, nullbits, BrinTupleHasNulls(tuple), + values, allnulls, hasnulls); + + /* + * Iterate to assign each of the values to the corresponding item in the + * values array of each column. The copies occur in the tuple's context. + */ + oldcxt = MemoryContextSwitchTo(dtup->bt_context); + for (valueno = 0, keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + int i; + + if (allnulls[keyno]) + { + valueno += brdesc->bd_info[keyno]->oi_nstored; + continue; + } + + /* + * We would like to skip datumCopy'ing the values datum in some cases, + * caller permitting ... 
+ */ + for (i = 0; i < brdesc->bd_info[keyno]->oi_nstored; i++) + dtup->bt_columns[keyno].bv_values[i] = + datumCopy(values[valueno++], + brdesc->bd_tupdesc->attrs[keyno]->attbyval, + brdesc->bd_tupdesc->attrs[keyno]->attlen); + + dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno]; + dtup->bt_columns[keyno].bv_allnulls = false; + } + + MemoryContextSwitchTo(oldcxt); + + pfree(values); + pfree(allnulls); + pfree(hasnulls); + + return dtup; +} + +/* + * brin_deconstruct_tuple + * Guts of attribute extraction from an on-disk BRIN tuple. + * + * Its arguments are: + * brdesc BRIN descriptor for the stored tuple + * tp pointer to the tuple data area + * nullbits pointer to the tuple nulls bitmask + * nulls "has nulls" bit in tuple infomask + * values output values, array of size brdesc->bd_totalstored + * allnulls output "allnulls", size brdesc->bd_tupdesc->natts + * hasnulls output "hasnulls", size brdesc->bd_tupdesc->natts + * + * Output arrays must have been allocated by caller. + */ +static inline void +brin_deconstruct_tuple(BrinDesc *brdesc, + char *tp, bits8 *nullbits, bool nulls, + Datum *values, bool *allnulls, bool *hasnulls) +{ + int attnum; + int stored; + TupleDesc diskdsc; + long off; + + /* + * First iterate to natts to obtain both null flags for each attribute. + * Note that we reverse the sense of the att_isnull test, because we store + * 1 for a null value (rather than a 1 for a not null value as is the + * att_isnull convention used elsewhere.) See brin_form_tuple. + */ + for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++) + { + /* + * the "all nulls" bit means that all values in the page range for + * this column are nulls. Therefore there are no values in the tuple + * data area. + */ + allnulls[attnum] = nulls && !att_isnull(attnum, nullbits); + + /* + * the "has nulls" bit means that some tuples have nulls, but others + * have not-null values. Therefore we know the tuple contains data + * for this column. 
+ * + * The hasnulls bits follow the allnulls bits in the same bitmask. + */ + hasnulls[attnum] = + nulls && !att_isnull(brdesc->bd_tupdesc->natts + attnum, nullbits); + } + + /* + * Iterate to obtain each attribute's stored values. Note that since we + * may reuse attribute entries for more than one column, we cannot cache + * offsets here. + */ + diskdsc = brtuple_disk_tupdesc(brdesc); + stored = 0; + off = 0; + for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++) + { + int datumno; + + if (allnulls[attnum]) + { + stored += brdesc->bd_info[attnum]->oi_nstored; + continue; + } + + for (datumno = 0; + datumno < brdesc->bd_info[attnum]->oi_nstored; + datumno++) + { + Form_pg_attribute thisatt = diskdsc->attrs[stored]; + + if (thisatt->attlen == -1) + { + off = att_align_pointer(off, thisatt->attalign, -1, + tp + off); + } + else + { + /* not varlena, so safe to use att_align_nominal */ + off = att_align_nominal(off, thisatt->attalign); + } + + values[stored++] = fetchatt(thisatt, tp + off); + + off = att_addlength_pointer(off, thisatt->attlen, tp + off); + } + } +} diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c new file mode 100644 index 00000000000..8dc80ad1e52 --- /dev/null +++ b/src/backend/access/brin/brin_xlog.c @@ -0,0 +1,291 @@ +/* + * brin_xlog.c + * XLog replay routines for BRIN indexes + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_xlog.c + */ +#include "postgres.h" + +#include "access/brin_page.h" +#include "access/brin_pageops.h" +#include "access/brin_xlog.h" +#include "access/xlogutils.h" + + +/* + * xlog replay routines + */ +static void +brin_xlog_createidx(XLogRecPtr lsn, XLogRecord *record) +{ + xl_brin_createidx *xlrec = (xl_brin_createidx *) XLogRecGetData(record); + Buffer buf; + Page page; + + /* Backup blocks are not used in create_index 
records */ + Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + + /* create the index' metapage */ + buf = XLogReadBuffer(xlrec->node, BRIN_METAPAGE_BLKNO, true); + Assert(BufferIsValid(buf)); + page = (Page) BufferGetPage(buf); + brin_metapage_init(page, xlrec->pagesPerRange, xlrec->version); + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + UnlockReleaseBuffer(buf); +} + +/* + * Common part of an insert or update. Inserts the new tuple and updates the + * revmap. + */ +static void +brin_xlog_insert_update(XLogRecPtr lsn, XLogRecord *record, + xl_brin_insert *xlrec, BrinTuple *tuple) +{ + BlockNumber blkno; + Buffer buffer; + Page page; + XLogRedoAction action; + + blkno = ItemPointerGetBlockNumber(&xlrec->tid); + + /* + * If we inserted the first and only tuple on the page, re-initialize the + * page from scratch. + */ + if (record->xl_info & XLOG_BRIN_INIT_PAGE) + { + XLogReadBufferForRedoExtended(lsn, record, 0, + xlrec->node, MAIN_FORKNUM, blkno, + RBM_ZERO, false, &buffer); + page = BufferGetPage(buffer); + brin_page_init(page, BRIN_PAGETYPE_REGULAR); + action = BLK_NEEDS_REDO; + } + else + { + action = XLogReadBufferForRedo(lsn, record, 0, + xlrec->node, blkno, &buffer); + } + + /* insert the index item into the page */ + if (action == BLK_NEEDS_REDO) + { + OffsetNumber offnum; + + Assert(tuple->bt_blkno == xlrec->heapBlk); + + page = (Page) BufferGetPage(buffer); + offnum = ItemPointerGetOffsetNumber(&(xlrec->tid)); + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "brin_xlog_insert_update: invalid max offset number"); + + offnum = PageAddItem(page, (Item) tuple, xlrec->tuplen, offnum, true, + false); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "brin_xlog_insert_update: failed to add tuple"); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* update the revmap */ + action = XLogReadBufferForRedo(lsn, record, 1, xlrec->node, + xlrec->revmapBlk, &buffer); + if 
(action == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk, + xlrec->tid); + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* XXX no FSM updates here ... */ +} + +/* + * replay a BRIN index insertion + */ +static void +brin_xlog_insert(XLogRecPtr lsn, XLogRecord *record) +{ + xl_brin_insert *xlrec = (xl_brin_insert *) XLogRecGetData(record); + BrinTuple *newtup; + + newtup = (BrinTuple *) ((char *) xlrec + SizeOfBrinInsert); + + brin_xlog_insert_update(lsn, record, xlrec, newtup); +} + +/* + * replay a BRIN index update + */ +static void +brin_xlog_update(XLogRecPtr lsn, XLogRecord *record) +{ + xl_brin_update *xlrec = (xl_brin_update *) XLogRecGetData(record); + BlockNumber blkno; + Buffer buffer; + BrinTuple *newtup; + XLogRedoAction action; + + newtup = (BrinTuple *) ((char *) xlrec + SizeOfBrinUpdate); + + /* First remove the old tuple */ + blkno = ItemPointerGetBlockNumber(&(xlrec->oldtid)); + action = XLogReadBufferForRedo(lsn, record, 2, xlrec->new.node, + blkno, &buffer); + if (action == BLK_NEEDS_REDO) + { + Page page; + OffsetNumber offnum; + + page = (Page) BufferGetPage(buffer); + + offnum = ItemPointerGetOffsetNumber(&(xlrec->oldtid)); + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "brin_xlog_update: invalid max offset number"); + + PageIndexDeleteNoCompact(page, &offnum, 1); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + /* Then insert the new tuple and update revmap, like in an insertion. */ + brin_xlog_insert_update(lsn, record, &xlrec->new, newtup); + + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * Update a tuple on a single page. 
+ */ +static void +brin_xlog_samepage_update(XLogRecPtr lsn, XLogRecord *record) +{ + xl_brin_samepage_update *xlrec; + BlockNumber blkno; + Buffer buffer; + XLogRedoAction action; + + xlrec = (xl_brin_samepage_update *) XLogRecGetData(record); + blkno = ItemPointerGetBlockNumber(&(xlrec->tid)); + action = XLogReadBufferForRedo(lsn, record, 0, xlrec->node, blkno, + &buffer); + if (action == BLK_NEEDS_REDO) + { + int tuplen; + BrinTuple *mmtuple; + Page page; + OffsetNumber offnum; + + tuplen = record->xl_len - SizeOfBrinSamepageUpdate; + mmtuple = (BrinTuple *) ((char *) xlrec + SizeOfBrinSamepageUpdate); + + page = (Page) BufferGetPage(buffer); + + offnum = ItemPointerGetOffsetNumber(&(xlrec->tid)); + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "brin_xlog_samepage_update: invalid max offset number"); + + PageIndexDeleteNoCompact(page, &offnum, 1); + offnum = PageAddItem(page, (Item) mmtuple, tuplen, offnum, true, false); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "brin_xlog_samepage_update: failed to add tuple"); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* XXX no FSM updates here ... 
*/ +} + +/* + * Replay a revmap page extension + */ +static void +brin_xlog_revmap_extend(XLogRecPtr lsn, XLogRecord *record) +{ + xl_brin_revmap_extend *xlrec; + Buffer metabuf; + Buffer buf; + Page page; + XLogRedoAction action; + + xlrec = (xl_brin_revmap_extend *) XLogRecGetData(record); + /* Update the metapage */ + action = XLogReadBufferForRedo(lsn, record, 0, xlrec->node, + BRIN_METAPAGE_BLKNO, &metabuf); + if (action == BLK_NEEDS_REDO) + { + Page metapg; + BrinMetaPageData *metadata; + + metapg = BufferGetPage(metabuf); + metadata = (BrinMetaPageData *) PageGetContents(metapg); + + Assert(metadata->lastRevmapPage == xlrec->targetBlk - 1); + metadata->lastRevmapPage = xlrec->targetBlk; + + PageSetLSN(metapg, lsn); + MarkBufferDirty(metabuf); + } + + /* + * Re-init the target block as a revmap page. There's never a full- page + * image here. + */ + + buf = XLogReadBuffer(xlrec->node, xlrec->targetBlk, true); + page = (Page) BufferGetPage(buf); + brin_page_init(page, BRIN_PAGETYPE_REVMAP); + + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + + UnlockReleaseBuffer(buf); + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +void +brin_redo(XLogRecPtr lsn, XLogRecord *record) +{ + uint8 info = record->xl_info & ~XLR_INFO_MASK; + + switch (info & XLOG_BRIN_OPMASK) + { + case XLOG_BRIN_CREATE_INDEX: + brin_xlog_createidx(lsn, record); + break; + case XLOG_BRIN_INSERT: + brin_xlog_insert(lsn, record); + break; + case XLOG_BRIN_UPDATE: + brin_xlog_update(lsn, record); + break; + case XLOG_BRIN_SAMEPAGE_UPDATE: + brin_xlog_samepage_update(lsn, record); + break; + case XLOG_BRIN_REVMAP_EXTEND: + brin_xlog_revmap_extend(lsn, record); + break; + default: + elog(PANIC, "brin_redo: unknown op code %u", info); + } +} diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index e0b81b9eb51..c55a7758273 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -209,6 +209,13 @@ 
static relopt_int intRelOpts[] = RELOPT_KIND_HEAP | RELOPT_KIND_TOAST }, -1, 0, 2000000000 }, + { + { + "pages_per_range", + "Number of pages that each page range covers in a BRIN index", + RELOPT_KIND_BRIN + }, 128, 1, 131072 + }, /* list terminator */ {{NULL}} diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 8f671ac4342..43098f44422 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -272,6 +272,8 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) scan->rs_startblock = 0; } + scan->rs_initblock = 0; + scan->rs_numblocks = InvalidBlockNumber; scan->rs_inited = false; scan->rs_ctup.t_data = NULL; ItemPointerSetInvalid(&scan->rs_ctup.t_self); @@ -297,6 +299,14 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) pgstat_count_heap_scan(scan->rs_rd); } +void +heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks) +{ + scan->rs_startblock = startBlk; + scan->rs_initblock = startBlk; + scan->rs_numblocks = numBlks; +} + /* * heapgetpage - subroutine for heapgettup() * @@ -637,7 +647,8 @@ heapgettup(HeapScanDesc scan, */ if (backward) { - finished = (page == scan->rs_startblock); + finished = (page == scan->rs_startblock) || + (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false); if (page == 0) page = scan->rs_nblocks; page--; @@ -647,7 +658,8 @@ heapgettup(HeapScanDesc scan, page++; if (page >= scan->rs_nblocks) page = 0; - finished = (page == scan->rs_startblock); + finished = (page == scan->rs_startblock) || + (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false); /* * Report our new scan position for synchronization purposes. We @@ -898,7 +910,8 @@ heapgettup_pagemode(HeapScanDesc scan, */ if (backward) { - finished = (page == scan->rs_startblock); + finished = (page == scan->rs_startblock) || + (scan->rs_numblocks != InvalidBlockNumber ? 
--scan->rs_numblocks <= 0 : false); if (page == 0) page = scan->rs_nblocks; page--; @@ -908,7 +921,8 @@ heapgettup_pagemode(HeapScanDesc scan, page++; if (page >= scan->rs_nblocks) page = 0; - finished = (page == scan->rs_startblock); + finished = (page == scan->rs_startblock) || + (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false); /* * Report our new scan position for synchronization purposes. We diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile index 7d092d205d6..32cb985036c 100644 --- a/src/backend/access/rmgrdesc/Makefile +++ b/src/backend/access/rmgrdesc/Makefile @@ -8,7 +8,8 @@ subdir = src/backend/access/rmgrdesc top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = clogdesc.o dbasedesc.o gindesc.o gistdesc.o hashdesc.o heapdesc.o \ +OBJS = brindesc.o clogdesc.o dbasedesc.o gindesc.o gistdesc.o \ + hashdesc.o heapdesc.o \ mxactdesc.o nbtdesc.o relmapdesc.o seqdesc.o smgrdesc.o spgdesc.o \ standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o diff --git a/src/backend/access/rmgrdesc/brindesc.c b/src/backend/access/rmgrdesc/brindesc.c new file mode 100644 index 00000000000..39135bf52e7 --- /dev/null +++ b/src/backend/access/rmgrdesc/brindesc.c @@ -0,0 +1,112 @@ +/*------------------------------------------------------------------------- + * + * brindesc.c + * rmgr descriptor routines for BRIN indexes + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/brindesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/brin_xlog.h" + +void +brin_desc(StringInfo buf, XLogRecord *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = record->xl_info & ~XLR_INFO_MASK; + + info &= XLOG_BRIN_OPMASK; + if (info == XLOG_BRIN_CREATE_INDEX) + { + 
xl_brin_createidx *xlrec = (xl_brin_createidx *) rec; + + appendStringInfo(buf, "v%d pagesPerRange %u rel %u/%u/%u", + xlrec->version, xlrec->pagesPerRange, + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode); + } + else if (info == XLOG_BRIN_INSERT) + { + xl_brin_insert *xlrec = (xl_brin_insert *) rec; + + appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u TID (%u,%u)", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, + xlrec->heapBlk, xlrec->revmapBlk, + xlrec->pagesPerRange, + ItemPointerGetBlockNumber(&xlrec->tid), + ItemPointerGetOffsetNumber(&xlrec->tid)); + } + else if (info == XLOG_BRIN_UPDATE) + { + xl_brin_update *xlrec = (xl_brin_update *) rec; + + appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u old TID (%u,%u) TID (%u,%u)", + xlrec->new.node.spcNode, xlrec->new.node.dbNode, + xlrec->new.node.relNode, + xlrec->new.heapBlk, xlrec->new.revmapBlk, + xlrec->new.pagesPerRange, + ItemPointerGetBlockNumber(&xlrec->oldtid), + ItemPointerGetOffsetNumber(&xlrec->oldtid), + ItemPointerGetBlockNumber(&xlrec->new.tid), + ItemPointerGetOffsetNumber(&xlrec->new.tid)); + } + else if (info == XLOG_BRIN_SAMEPAGE_UPDATE) + { + xl_brin_samepage_update *xlrec = (xl_brin_samepage_update *) rec; + + appendStringInfo(buf, "rel %u/%u/%u TID (%u,%u)", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, + ItemPointerGetBlockNumber(&xlrec->tid), + ItemPointerGetOffsetNumber(&xlrec->tid)); + } + else if (info == XLOG_BRIN_REVMAP_EXTEND) + { + xl_brin_revmap_extend *xlrec = (xl_brin_revmap_extend *) rec; + + appendStringInfo(buf, "rel %u/%u/%u targetBlk %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->targetBlk); + } +} + +const char * +brin_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_BRIN_CREATE_INDEX: + id = "CREATE_INDEX"; + break; + case XLOG_BRIN_INSERT: + id = "INSERT"; + break; + case 
XLOG_BRIN_INSERT | XLOG_BRIN_INIT_PAGE: + id = "INSERT+INIT"; + break; + case XLOG_BRIN_UPDATE: + id = "UPDATE"; + break; + case XLOG_BRIN_UPDATE | XLOG_BRIN_INIT_PAGE: + id = "UPDATE+INIT"; + break; + case XLOG_BRIN_SAMEPAGE_UPDATE: + id = "SAMEPAGE_UPDATE"; + break; + case XLOG_BRIN_REVMAP_EXTEND: + id = "REVMAP_EXTEND"; + break; + } + + return id; +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 2645a7a3685..befd60f2d37 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -12,6 +12,7 @@ #include "access/gist_private.h" #include "access/hash.h" #include "access/heapam_xlog.h" +#include "access/brin_xlog.h" #include "access/multixact.h" #include "access/nbtree.h" #include "access/spgist.h" diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 0c31aa95d70..912038a712e 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -2104,6 +2104,27 @@ IndexBuildHeapScan(Relation heapRelation, IndexBuildCallback callback, void *callback_state) { + return IndexBuildHeapRangeScan(heapRelation, indexRelation, + indexInfo, allow_sync, + 0, InvalidBlockNumber, + callback, callback_state); +} + +/* + * As above, except that instead of scanning the complete heap, only the given + * number of blocks are scanned. Scan to end-of-rel can be signalled by + * passing InvalidBlockNumber as numblocks. + */ +double +IndexBuildHeapRangeScan(Relation heapRelation, + Relation indexRelation, + IndexInfo *indexInfo, + bool allow_sync, + BlockNumber start_blockno, + BlockNumber numblocks, + IndexBuildCallback callback, + void *callback_state) +{ bool is_system_catalog; bool checking_uniqueness; HeapScanDesc scan; @@ -2174,6 +2195,9 @@ IndexBuildHeapScan(Relation heapRelation, true, /* buffer access strategy OK */ allow_sync); /* syncscan OK? 
*/ + /* set our scan endpoints */ + heap_setscanlimits(scan, start_blockno, numblocks); + reltuples = 0; /* diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 9f1b20e04ab..8e78aafda7c 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -132,6 +132,7 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogRecord *record) case RM_GIST_ID: case RM_SEQ_ID: case RM_SPGIST_ID: + case RM_BRIN_ID: break; case RM_NEXT_ID: elog(ERROR, "unexpected RM_NEXT_ID rmgr_id: %u", (RmgrIds) buf.record.xl_rmid); diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 6351a9bea47..2b858c82719 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -399,7 +399,8 @@ PageRestoreTempPage(Page tempPage, Page oldPage) } /* - * sorting support for PageRepairFragmentation and PageIndexMultiDelete + * sorting support for PageRepairFragmentation, PageIndexMultiDelete, + * PageIndexDeleteNoCompact */ typedef struct itemIdSortData { @@ -896,6 +897,182 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) phdr->pd_upper = upper; } +/* + * PageIndexDeleteNoCompact + * Delete the given items for an index page, and defragment the resulting + * free space, but do not compact the item pointers array. + * + * itemnos is the array of tuples to delete; nitems is its size. maxIdxTuples + * is the maximum number of tuples that can exist in a page. + * + * Unused items at the end of the array are removed. + * + * This is used for index AMs that require that existing TIDs of live tuples + * remain unchanged. 
+ */ +void +PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos, int nitems) +{ + PageHeader phdr = (PageHeader) page; + LocationIndex pd_lower = phdr->pd_lower; + LocationIndex pd_upper = phdr->pd_upper; + LocationIndex pd_special = phdr->pd_special; + int nline; + bool empty; + OffsetNumber offnum; + int nextitm; + + /* + * As with PageRepairFragmentation, paranoia seems justified. + */ + if (pd_lower < SizeOfPageHeaderData || + pd_lower > pd_upper || + pd_upper > pd_special || + pd_special > BLCKSZ || + pd_special != MAXALIGN(pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + pd_lower, pd_upper, pd_special))); + + /* + * Scan the existing item pointer array and mark as unused those that are + * in our kill-list; make sure any non-interesting ones are marked unused + * as well. + */ + nline = PageGetMaxOffsetNumber(page); + empty = true; + nextitm = 0; + for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum)) + { + ItemId lp; + ItemLength itemlen; + ItemOffset offset; + + lp = PageGetItemId(page, offnum); + + itemlen = ItemIdGetLength(lp); + offset = ItemIdGetOffset(lp); + + if (ItemIdIsUsed(lp)) + { + if (offset < pd_upper || + (offset + itemlen) > pd_special || + offset != MAXALIGN(offset)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted item pointer: offset = %u, length = %u", + offset, (unsigned int) itemlen))); + + if (nextitm < nitems && offnum == itemnos[nextitm]) + { + /* this one is on our list to delete, so mark it unused */ + ItemIdSetUnused(lp); + nextitm++; + } + else if (ItemIdHasStorage(lp)) + { + /* This one's live -- must do the compaction dance */ + empty = false; + } + else + { + /* get rid of this one too */ + ItemIdSetUnused(lp); + } + } + } + + /* this will catch invalid or out-of-order itemnos[] */ + if (nextitm != nitems) + elog(ERROR, "incorrect index offsets supplied"); + + if (empty) + { 
+ /* Page is completely empty, so just reset it quickly */ + phdr->pd_lower = SizeOfPageHeaderData; + phdr->pd_upper = pd_special; + } + else + { + /* There are live items: need to compact the page the hard way */ + itemIdSortData itemidbase[MaxOffsetNumber]; + itemIdSort itemidptr; + int i; + Size totallen; + Offset upper; + + /* + * Scan the page taking note of each item that we need to preserve. + * This includes both live items (those that contain data) and + * interspersed unused ones. It's critical to preserve these unused + * items, because otherwise the offset numbers for later live items + * would change, which is not acceptable. Unused items might get used + * again later; that is fine. + */ + itemidptr = itemidbase; + totallen = 0; + for (i = 0; i < nline; i++, itemidptr++) + { + ItemId lp; + + itemidptr->offsetindex = i; + + lp = PageGetItemId(page, i + 1); + if (ItemIdHasStorage(lp)) + { + itemidptr->itemoff = ItemIdGetOffset(lp); + itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp)); + totallen += itemidptr->alignedlen; + } + else + { + itemidptr->itemoff = 0; + itemidptr->alignedlen = 0; + } + } + /* By here, there are exactly nline elements in itemidbase array */ + + if (totallen > (Size) (pd_special - pd_lower)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted item lengths: total %u, available space %u", + (unsigned int) totallen, pd_special - pd_lower))); + + /* sort itemIdSortData array into decreasing itemoff order */ + qsort((char *) itemidbase, nline, sizeof(itemIdSortData), + itemoffcompare); + + /* + * Defragment the data areas of each tuple, being careful to preserve + * each item's position in the linp array. 
+ */ + upper = pd_special; + PageClearHasFreeLinePointers(page); + for (i = 0, itemidptr = itemidbase; i < nline; i++, itemidptr++) + { + ItemId lp; + + lp = PageGetItemId(page, itemidptr->offsetindex + 1); + if (itemidptr->alignedlen == 0) + { + PageSetHasFreeLinePointers(page); + ItemIdSetUnused(lp); + continue; + } + upper -= itemidptr->alignedlen; + memmove((char *) page + upper, + (char *) page + itemidptr->itemoff, + itemidptr->alignedlen); + lp->lp_off = upper; + /* lp_flags and lp_len remain the same as originally */ + } + + /* Set the new page limits */ + phdr->pd_upper = upper; + phdr->pd_lower = SizeOfPageHeaderData + i * sizeof(ItemIdData); + } +} /* * Set checksum for a page in shared buffers. diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index e932ccf0da5..ea9150b23f0 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6081,7 +6081,7 @@ genericcostestimate(PlannerInfo *root, else numIndexPages = 1.0; - /* fetch estimated page cost for schema containing index */ + /* fetch estimated page cost for tablespace containing index */ get_tablespace_page_costs(index->reltablespace, &spc_random_page_cost, NULL); @@ -7162,7 +7162,7 @@ gincostestimate(PG_FUNCTION_ARGS) JOIN_INNER, NULL); - /* fetch estimated page cost for schema containing index */ + /* fetch estimated page cost for tablespace containing index */ get_tablespace_page_costs(index->reltablespace, &spc_random_page_cost, NULL); @@ -7349,3 +7349,73 @@ gincostestimate(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * BRIN has search behavior completely different from other index types + */ +Datum +brincostestimate(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + IndexPath *path = (IndexPath *) PG_GETARG_POINTER(1); + double loop_count = PG_GETARG_FLOAT8(2); + Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3); + Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); + Selectivity *indexSelectivity 
= (Selectivity *) PG_GETARG_POINTER(5); + double *indexCorrelation = (double *) PG_GETARG_POINTER(6); + IndexOptInfo *index = path->indexinfo; + List *indexQuals = path->indexquals; + List *indexOrderBys = path->indexorderbys; + double numPages = index->pages; + double numTuples = index->tuples; + Cost spc_seq_page_cost; + Cost spc_random_page_cost; + QualCost index_qual_cost; + double qual_op_cost; + double qual_arg_cost; + + /* fetch estimated page cost for tablespace containing index */ + get_tablespace_page_costs(index->reltablespace, + &spc_random_page_cost, + &spc_seq_page_cost); + + /* + * BRIN indexes are always read in full; use that as startup cost. + * XXX maybe only include revmap pages here? + */ + *indexStartupCost = spc_seq_page_cost * numPages * loop_count; + + /* + * To read a BRIN index there might be a bit of back and forth over regular + * pages, as revmap might point to them out of sequential order; calculate + * this as reading the whole index in random order. + */ + *indexTotalCost = spc_random_page_cost * numPages * loop_count; + + *indexSelectivity = + clauselist_selectivity(root, path->indexquals, + path->indexinfo->rel->relid, + JOIN_INNER, NULL); + *indexCorrelation = 1; + + /* + * Add on index qual eval costs, much as in genericcostestimate. + */ + cost_qual_eval(&index_qual_cost, indexQuals, root); + qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple; + cost_qual_eval(&index_qual_cost, indexOrderBys, root); + qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple; + qual_op_cost = cpu_operator_cost * + (list_length(indexQuals) + list_length(indexOrderBys)); + qual_arg_cost -= qual_op_cost; + if (qual_arg_cost < 0) /* just in case... */ + qual_arg_cost = 0; + + *indexStartupCost += qual_arg_cost; + *indexTotalCost += qual_arg_cost; + *indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost); + + /* XXX what about pages_per_range? 
*/ + + PG_RETURN_VOID(); +} diff --git a/src/include/access/brin.h b/src/include/access/brin.h new file mode 100644 index 00000000000..a522c2062b9 --- /dev/null +++ b/src/include/access/brin.h @@ -0,0 +1,52 @@ +/* + * AM-callable functions for BRIN indexes + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/brin.h + */ +#ifndef BRIN_H +#define BRIN_H + +#include "fmgr.h" +#include "nodes/execnodes.h" +#include "utils/relcache.h" + + +/* + * prototypes for functions in brin.c (external entry points for BRIN) + */ +extern Datum brinbuild(PG_FUNCTION_ARGS); +extern Datum brinbuildempty(PG_FUNCTION_ARGS); +extern Datum brininsert(PG_FUNCTION_ARGS); +extern Datum brinbeginscan(PG_FUNCTION_ARGS); +extern Datum bringettuple(PG_FUNCTION_ARGS); +extern Datum bringetbitmap(PG_FUNCTION_ARGS); +extern Datum brinrescan(PG_FUNCTION_ARGS); +extern Datum brinendscan(PG_FUNCTION_ARGS); +extern Datum brinmarkpos(PG_FUNCTION_ARGS); +extern Datum brinrestrpos(PG_FUNCTION_ARGS); +extern Datum brinbulkdelete(PG_FUNCTION_ARGS); +extern Datum brinvacuumcleanup(PG_FUNCTION_ARGS); +extern Datum brincanreturn(PG_FUNCTION_ARGS); +extern Datum brincostestimate(PG_FUNCTION_ARGS); +extern Datum brinoptions(PG_FUNCTION_ARGS); + +/* + * Storage type for BRIN's reloptions + */ +typedef struct BrinOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + BlockNumber pagesPerRange; +} BrinOptions; + +#define BRIN_DEFAULT_PAGES_PER_RANGE 128 +#define BrinGetPagesPerRange(relation) \ + ((relation)->rd_options ? 
\ + ((BrinOptions *) (relation)->rd_options)->pagesPerRange : \ + BRIN_DEFAULT_PAGES_PER_RANGE) + +#endif /* BRIN_H */ diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h new file mode 100644 index 00000000000..651ab5f67e4 --- /dev/null +++ b/src/include/access/brin_internal.h @@ -0,0 +1,88 @@ +/* + * brin_internal.h + * internal declarations for BRIN indexes + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/brin_internal.h + */ +#ifndef BRIN_INTERNAL_H +#define BRIN_INTERNAL_H + +#include "fmgr.h" +#include "storage/buf.h" +#include "storage/bufpage.h" +#include "storage/off.h" +#include "utils/relcache.h" + + +/* + * A BrinDesc is a struct designed to enable decoding a BRIN tuple from the + * on-disk format to an in-memory tuple and vice-versa. + */ + +/* struct returned by "OpcInfo" amproc */ +typedef struct BrinOpcInfo +{ + /* Number of columns stored in an index column of this opclass */ + uint16 oi_nstored; + + /* Opaque pointer for the opclass' private use */ + void *oi_opaque; + + /* Type IDs of the stored columns */ + Oid oi_typids[FLEXIBLE_ARRAY_MEMBER]; +} BrinOpcInfo; + +/* the size of a BrinOpcInfo for the given number of columns; the argument is parenthesized so that expressions such as (n + 1) expand with the intended precedence */ +#define SizeofBrinOpcInfo(ncols) \ + (offsetof(BrinOpcInfo, oi_typids) + sizeof(Oid) * (ncols)) + +typedef struct BrinDesc +{ + /* Containing memory context */ + MemoryContext bd_context; + + /* the index relation itself */ + Relation bd_index; + + /* tuple descriptor of the index relation */ + TupleDesc bd_tupdesc; + + /* cached copy for on-disk tuples; generated at first use */ + TupleDesc bd_disktdesc; + + /* total number of Datum entries that are stored on-disk for all columns */ + int bd_totalstored; + + /* per-column info; bd_tupdesc->natts entries long */ + BrinOpcInfo *bd_info[FLEXIBLE_ARRAY_MEMBER]; +} BrinDesc; + +/* + * Globally-known 
function support numbers for BRIN indexes. Individual + * opclasses define their own function support numbers, which must not collide + * with the definitions here. + */ +#define BRIN_PROCNUM_OPCINFO 1 +#define BRIN_PROCNUM_ADDVALUE 2 +#define BRIN_PROCNUM_CONSISTENT 3 +#define BRIN_PROCNUM_UNION 4 +/* procedure numbers up to 10 are reserved for BRIN future expansion */ + +#define BRIN_DEBUG + +/* we allow debug if using GCC; otherwise BRIN_elog expands to a no-op expression. NOTE(review): the fallback takes a single macro parameter -- confirm that multi-argument calls build on non-GCC compilers */ +#if defined(BRIN_DEBUG) && defined(__GNUC__) +#define BRIN_elog(level, ...) elog(level, __VA_ARGS__) +#else +#define BRIN_elog(a) ((void) 0) +#endif + +/* brin.c */ +extern BrinDesc *brin_build_desc(Relation rel); +extern void brin_free_desc(BrinDesc *bdesc); + +#endif /* BRIN_INTERNAL_H */ diff --git a/src/include/access/brin_page.h b/src/include/access/brin_page.h new file mode 100644 index 00000000000..636cf86eafa --- /dev/null +++ b/src/include/access/brin_page.h @@ -0,0 +1,70 @@ +/* + * brin_page.h + * Prototypes and definitions for BRIN page layouts + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/brin_page.h + * + * NOTES + * + * These structs should really be private to specific BRIN files, but it's + * useful to have them here so that they can be used by pageinspect and similar + * tools. 
+ */ +#ifndef BRIN_PAGE_H +#define BRIN_PAGE_H + +#include "storage/block.h" +#include "storage/itemptr.h" + +/* special space on all BRIN pages stores a "type" identifier */ +#define BRIN_PAGETYPE_META 0xF091 +#define BRIN_PAGETYPE_REVMAP 0xF092 +#define BRIN_PAGETYPE_REGULAR 0xF093 + +#define BRIN_PAGE_TYPE(page) \ + (((BrinSpecialSpace *) PageGetSpecialPointer(page))->type) +#define BRIN_IS_REVMAP_PAGE(page) (BRIN_PAGE_TYPE(page) == BRIN_PAGETYPE_REVMAP) +#define BRIN_IS_REGULAR_PAGE(page) (BRIN_PAGE_TYPE(page) == BRIN_PAGETYPE_REGULAR) + +/* flags for BrinSpecialSpace */ +#define BRIN_EVACUATE_PAGE (1 << 0) + +typedef struct BrinSpecialSpace +{ + uint16 flags; + uint16 type; +} BrinSpecialSpace; + +/* Metapage definitions */ +typedef struct BrinMetaPageData +{ + uint32 brinMagic; + uint32 brinVersion; + BlockNumber pagesPerRange; + BlockNumber lastRevmapPage; +} BrinMetaPageData; + +#define BRIN_CURRENT_VERSION 1 +#define BRIN_META_MAGIC 0xA8109CFA + +#define BRIN_METAPAGE_BLKNO 0 + +/* Definitions for revmap pages */ +typedef struct RevmapContents +{ + ItemPointerData rm_tids[1]; /* really REVMAP_PAGE_MAXITEMS */ +} RevmapContents; + +#define REVMAP_CONTENT_SIZE \ + (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \ + offsetof(RevmapContents, rm_tids) - \ + MAXALIGN(sizeof(BrinSpecialSpace))) +/* max num of items in the array */ +#define REVMAP_PAGE_MAXITEMS \ + (REVMAP_CONTENT_SIZE / sizeof(ItemPointerData)) + +#endif /* BRIN_PAGE_H */ diff --git a/src/include/access/brin_pageops.h b/src/include/access/brin_pageops.h new file mode 100644 index 00000000000..86a9e81c2c5 --- /dev/null +++ b/src/include/access/brin_pageops.h @@ -0,0 +1,36 @@ +/* + * brin_pageops.h + * Prototypes for operating on BRIN pages. 
+ * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/brin_pageops.h + */ +#ifndef BRIN_PAGEOPS_H +#define BRIN_PAGEOPS_H + +#include "access/brin_revmap.h" + +extern bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, + BrinRevmap *revmap, BlockNumber heapBlk, + Buffer oldbuf, OffsetNumber oldoff, + const BrinTuple *origtup, Size origsz, + const BrinTuple *newtup, Size newsz, + bool samepage); +extern bool brin_can_do_samepage_update(Buffer buffer, Size origsz, + Size newsz); +extern OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, + BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, + BrinTuple *tup, Size itemsz); + +extern void brin_page_init(Page page, uint16 type); +extern void brin_metapage_init(Page page, BlockNumber pagesPerRange, + uint16 version); + +extern bool brin_start_evacuating_page(Relation idxRel, Buffer buf); +extern void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange, + BrinRevmap *revmap, Buffer buf); + +#endif /* BRIN_PAGEOPS_H */ diff --git a/src/include/access/brin_revmap.h b/src/include/access/brin_revmap.h new file mode 100644 index 00000000000..ff0e7e6e281 --- /dev/null +++ b/src/include/access/brin_revmap.h @@ -0,0 +1,39 @@ +/* + * brin_revmap.h + * Prototypes for BRIN reverse range maps + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/brin_revmap.h + */ + +#ifndef BRIN_REVMAP_H +#define BRIN_REVMAP_H + +#include "access/brin_tuple.h" +#include "storage/block.h" +#include "storage/buf.h" +#include "storage/itemptr.h" +#include "storage/off.h" +#include "utils/relcache.h" + +/* struct definition lives in brin_revmap.c */ +typedef struct BrinRevmap BrinRevmap; + +extern BrinRevmap 
*brinRevmapInitialize(Relation idxrel, + BlockNumber *pagesPerRange); +extern void brinRevmapTerminate(BrinRevmap *revmap); + +extern void brinRevmapExtend(BrinRevmap *revmap, + BlockNumber heapBlk); +extern Buffer brinLockRevmapPageForUpdate(BrinRevmap *revmap, + BlockNumber heapBlk); +extern void brinSetHeapBlockItemptr(Buffer rmbuf, BlockNumber pagesPerRange, + BlockNumber heapBlk, ItemPointerData tid); +extern BrinTuple *brinGetTupleForHeapBlock(BrinRevmap *revmap, + BlockNumber heapBlk, Buffer *buf, OffsetNumber *off, + Size *size, int mode); + +#endif /* BRIN_REVMAP_H */ diff --git a/src/include/access/brin_tuple.h b/src/include/access/brin_tuple.h new file mode 100644 index 00000000000..00f55e7a2bf --- /dev/null +++ b/src/include/access/brin_tuple.h @@ -0,0 +1,96 @@ +/* + * brin_tuple.h + * Declarations for dealing with BRIN-specific tuples. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/brin_tuple.h + */ +#ifndef BRIN_TUPLE_H +#define BRIN_TUPLE_H + +#include "access/brin_internal.h" +#include "access/tupdesc.h" + + +/* + * A BRIN index stores one index tuple per page range. Each index tuple + * has one BrinValues struct for each indexed column; in turn, each BrinValues + * has (besides the null flags) an array of Datum whose size is determined by + * the opclass. + */ +typedef struct BrinValues +{ + AttrNumber bv_attno; /* index attribute number */ + bool bv_hasnulls; /* are there any nulls in the page range? */ + bool bv_allnulls; /* are all values null in the page range? */ + Datum *bv_values; /* current accumulated values */ +} BrinValues; + +/* + * This struct is used to represent an in-memory index tuple. The values can + * only be meaningfully decoded with an appropriate BrinDesc. 
+ */ +typedef struct BrinMemTuple +{ + bool bt_placeholder; /* this is a placeholder tuple */ + BlockNumber bt_blkno; /* heap blkno that the tuple is for */ + MemoryContext bt_context; /* memcxt holding the bt_columns values */ + BrinValues bt_columns[FLEXIBLE_ARRAY_MEMBER]; +} BrinMemTuple; + +/* + * An on-disk BRIN tuple. This is possibly followed by a nulls bitmask, with + * room for two null bits for each indexed column; an opclass-defined + * number of Datum values for each column follow. + */ +typedef struct BrinTuple +{ + /* heap block number that the tuple is for */ + BlockNumber bt_blkno; + + /* --------------- + * bt_info is laid out in the following fashion: + * + * 7th (high) bit: has nulls + * 6th bit: is placeholder tuple + * 5th bit: unused + * 4-0 bit: offset of data + * --------------- + */ + uint8 bt_info; +} BrinTuple; + +#define SizeOfBrinTuple (offsetof(BrinTuple, bt_info) + sizeof(uint8)) + +/* + * bt_info manipulation macros + */ +#define BRIN_OFFSET_MASK 0x1F +/* bit 0x20 is not used at present */ +#define BRIN_PLACEHOLDER_MASK 0x40 +#define BRIN_NULLS_MASK 0x80 + +#define BrinTupleDataOffset(tup) ((Size) (((BrinTuple *) (tup))->bt_info & BRIN_OFFSET_MASK)) +#define BrinTupleHasNulls(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_NULLS_MASK)) != 0) +#define BrinTupleIsPlaceholder(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_PLACEHOLDER_MASK)) != 0) + + +extern BrinTuple *brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, + BrinMemTuple *tuple, Size *size); +extern BrinTuple *brin_form_placeholder_tuple(BrinDesc *brdesc, + BlockNumber blkno, Size *size); +extern void brin_free_tuple(BrinTuple *tuple); +extern BrinTuple *brin_copy_tuple(BrinTuple *tuple, Size len); +extern bool brin_tuples_equal(const BrinTuple *a, Size alen, + const BrinTuple *b, Size blen); + +extern BrinMemTuple *brin_new_memtuple(BrinDesc *brdesc); +extern void brin_memtuple_initialize(BrinMemTuple *dtuple, + BrinDesc *brdesc); +extern BrinMemTuple 
*brin_deform_tuple(BrinDesc *brdesc, + BrinTuple *tuple); + +#endif /* BRIN_TUPLE_H */ diff --git a/src/include/access/brin_xlog.h b/src/include/access/brin_xlog.h new file mode 100644 index 00000000000..3d959e81d63 --- /dev/null +++ b/src/include/access/brin_xlog.h @@ -0,0 +1,109 @@ +/*------------------------------------------------------------------------- + * + * brin_xlog.h + * POSTGRES BRIN access XLOG definitions. + * + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/brin_xlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef BRIN_XLOG_H +#define BRIN_XLOG_H + +#include "access/xlogrecord.h" +#include "lib/stringinfo.h" +#include "storage/bufpage.h" +#include "storage/itemptr.h" +#include "storage/relfilenode.h" +#include "utils/relcache.h" + + +/* + * WAL record definitions for BRIN's WAL operations + * + * XLOG allows to store some information in high 4 bits of log + * record xl_info field. + */ +#define XLOG_BRIN_CREATE_INDEX 0x00 +#define XLOG_BRIN_INSERT 0x10 +#define XLOG_BRIN_UPDATE 0x20 +#define XLOG_BRIN_SAMEPAGE_UPDATE 0x30 +#define XLOG_BRIN_REVMAP_EXTEND 0x40 +#define XLOG_BRIN_REVMAP_VACUUM 0x50 + +#define XLOG_BRIN_OPMASK 0x70 +/* + * When we insert the first item on a new page, we restore the entire page in + * redo. 
+ */ +#define XLOG_BRIN_INIT_PAGE 0x80 + +/* This is what we need to know about a BRIN index create */ +typedef struct xl_brin_createidx +{ + BlockNumber pagesPerRange; + RelFileNode node; + uint16 version; +} xl_brin_createidx; +#define SizeOfBrinCreateIdx (offsetof(xl_brin_createidx, version) + sizeof(uint16)) + +/* + * This is what we need to know about a BRIN tuple insert + */ +typedef struct xl_brin_insert +{ + RelFileNode node; + BlockNumber heapBlk; + + /* extra information needed to update the revmap */ + BlockNumber revmapBlk; + BlockNumber pagesPerRange; + + uint16 tuplen; + ItemPointerData tid; + /* tuple data follows at end of struct */ +} xl_brin_insert; + +#define SizeOfBrinInsert (offsetof(xl_brin_insert, tid) + sizeof(ItemPointerData)) + +/* + * A cross-page update is the same as an insert, but also store the old tid. + */ +typedef struct xl_brin_update +{ + ItemPointerData oldtid; + xl_brin_insert new; +} xl_brin_update; + +#define SizeOfBrinUpdate (offsetof(xl_brin_update, new) + SizeOfBrinInsert) + +/* This is what we need to know about a BRIN tuple samepage update */ +typedef struct xl_brin_samepage_update +{ + RelFileNode node; + ItemPointerData tid; + /* tuple data follows at end of struct */ +} xl_brin_samepage_update; + +#define SizeOfBrinSamepageUpdate (offsetof(xl_brin_samepage_update, tid) + sizeof(ItemPointerData)) + +/* This is what we need to know about a revmap extension */ +typedef struct xl_brin_revmap_extend +{ + RelFileNode node; + BlockNumber targetBlk; +} xl_brin_revmap_extend; + +#define SizeOfBrinRevmapExtend (offsetof(xl_brin_revmap_extend, targetBlk) + \ + sizeof(BlockNumber)) + + +extern void brin_desc(StringInfo buf, XLogRecord *record); +extern void brin_redo(XLogRecPtr lsn, XLogRecord *record); +extern const char *brin_identify(uint8 info); + +#endif /* BRIN_XLOG_H */ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 7f7166d832e..9cd66a1b0f9 100644 --- a/src/include/access/heapam.h +++ 
b/src/include/access/heapam.h @@ -113,6 +113,8 @@ extern HeapScanDesc heap_beginscan_strat(Relation relation, Snapshot snapshot, bool allow_strat, bool allow_sync); extern HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot, int nkeys, ScanKey key); +extern void heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, + BlockNumber endBlk); extern void heap_rescan(HeapScanDesc scan, ScanKey key); extern void heap_endscan(HeapScanDesc scan); extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction); diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h index c22644841f9..a538830be58 100644 --- a/src/include/access/reloptions.h +++ b/src/include/access/reloptions.h @@ -45,8 +45,9 @@ typedef enum relopt_kind RELOPT_KIND_TABLESPACE = (1 << 7), RELOPT_KIND_SPGIST = (1 << 8), RELOPT_KIND_VIEW = (1 << 9), + RELOPT_KIND_BRIN = (1 << 10), /* if you add a new kind, make sure you update "last_default" too */ - RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_VIEW, + RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_BRIN, /* some compilers treat enums as signed ints, so we can't use 1 << 31 */ RELOPT_KIND_MAX = (1 << 30) } relopt_kind; diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 8a57698feb6..8beb1be8829 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -35,8 +35,10 @@ typedef struct HeapScanDescData bool rs_temp_snap; /* unregister snapshot at scan end? */ /* state set up at initscan time */ - BlockNumber rs_nblocks; /* number of blocks to scan */ + BlockNumber rs_nblocks; /* total number of blocks in rel */ BlockNumber rs_startblock; /* block # to start at */ + BlockNumber rs_initblock; /* block # to consider initial of rel */ + BlockNumber rs_numblocks; /* number of blocks to scan */ BufferAccessStrategy rs_strategy; /* access strategy for reads */ bool rs_syncscan; /* report location to syncscan logic? 
*/ diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 77d4574ed17..76a6421fb68 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -42,3 +42,4 @@ PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gi PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup) PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL) PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup) +PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL) diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 2c059c88c18..b5c5e7aa5e8 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201411041 +#define CATALOG_VERSION_NO 201411071 #endif diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 098ac7df199..c36a729c91f 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -98,6 +98,14 @@ extern double IndexBuildHeapScan(Relation heapRelation, bool allow_sync, IndexBuildCallback callback, void *callback_state); +extern double IndexBuildHeapRangeScan(Relation heapRelation, + Relation indexRelation, + IndexInfo *indexInfo, + bool allow_sync, + BlockNumber start_blockno, + BlockNumber end_blockno, + IndexBuildCallback callback, + void *callback_state); extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot); diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h index 759ea705702..67b57cda915 100644 --- a/src/include/catalog/pg_am.h +++ b/src/include/catalog/pg_am.h @@ -132,5 +132,7 @@ DESCR("GIN index access method"); DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild 
spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions )); DESCR("SP-GiST index access method"); #define SPGIST_AM_OID 4000 +DATA(insert OID = 3580 ( brin 5 14 f f f f t t f t t f f 0 brininsert brinbeginscan - bringetbitmap brinrescan brinendscan brinmarkpos brinrestrpos brinbuild brinbuildempty brinbulkdelete brinvacuumcleanup - brincostestimate brinoptions )); +#define BRIN_AM_OID 3580 #endif /* PG_AM_H */ diff --git a/src/include/catalog/pg_amop.h b/src/include/catalog/pg_amop.h index 3ef5a49cc9b..e72cc6c093a 100644 --- a/src/include/catalog/pg_amop.h +++ b/src/include/catalog/pg_amop.h @@ -845,4 +845,168 @@ DATA(insert ( 3550 869 869 25 s 932 783 0 )); DATA(insert ( 3550 869 869 26 s 933 783 0 )); DATA(insert ( 3550 869 869 27 s 934 783 0 )); +/* BRIN opclasses */ +/* minmax bytea */ +DATA(insert ( 4064 17 17 1 s 1957 3580 0 )); +DATA(insert ( 4064 17 17 2 s 1958 3580 0 )); +DATA(insert ( 4064 17 17 3 s 1955 3580 0 )); +DATA(insert ( 4064 17 17 4 s 1960 3580 0 )); +DATA(insert ( 4064 17 17 5 s 1959 3580 0 )); +/* minmax "char" */ +DATA(insert ( 4062 18 18 1 s 631 3580 0 )); +DATA(insert ( 4062 18 18 2 s 632 3580 0 )); +DATA(insert ( 4062 18 18 3 s 92 3580 0 )); +DATA(insert ( 4062 18 18 4 s 634 3580 0 )); +DATA(insert ( 4062 18 18 5 s 633 3580 0 )); +/* minmax name */ +DATA(insert ( 4065 19 19 1 s 660 3580 0 )); +DATA(insert ( 4065 19 19 2 s 661 3580 0 )); +DATA(insert ( 4065 19 19 3 s 93 3580 0 )); +DATA(insert ( 4065 19 19 4 s 663 3580 0 )); +DATA(insert ( 4065 19 19 5 s 662 3580 0 )); +/* minmax bigint */ +DATA(insert ( 4063 20 20 1 s 412 3580 0 )); +DATA(insert ( 4063 20 20 2 s 414 3580 0 )); +DATA(insert ( 4063 20 20 3 s 410 3580 0 )); +DATA(insert ( 4063 20 20 4 s 415 3580 0 )); +DATA(insert ( 4063 20 20 5 s 413 3580 0 )); +/* minmax smallint */ +DATA(insert ( 4067 21 21 1 s 95 3580 0 )); +DATA(insert ( 4067 21 21 2 s 522 3580 0 )); +DATA(insert ( 4067 21 21 3 s 94 3580 0 )); +DATA(insert ( 4067 21 21 4 s 524 3580 0 )); 
+DATA(insert ( 4067 21 21 5 s 520 3580 0 )); +/* minmax integer */ +DATA(insert ( 4054 23 23 1 s 97 3580 0 )); +DATA(insert ( 4054 23 23 2 s 523 3580 0 )); +DATA(insert ( 4054 23 23 3 s 96 3580 0 )); +DATA(insert ( 4054 23 23 4 s 525 3580 0 )); +DATA(insert ( 4054 23 23 5 s 521 3580 0 )); +/* minmax text */ +DATA(insert ( 4056 25 25 1 s 664 3580 0 )); +DATA(insert ( 4056 25 25 2 s 665 3580 0 )); +DATA(insert ( 4056 25 25 3 s 98 3580 0 )); +DATA(insert ( 4056 25 25 4 s 667 3580 0 )); +DATA(insert ( 4056 25 25 5 s 666 3580 0 )); +/* minmax oid */ +DATA(insert ( 4068 26 26 1 s 609 3580 0 )); +DATA(insert ( 4068 26 26 2 s 611 3580 0 )); +DATA(insert ( 4068 26 26 3 s 607 3580 0 )); +DATA(insert ( 4068 26 26 4 s 612 3580 0 )); +DATA(insert ( 4068 26 26 5 s 610 3580 0 )); +/* minmax tid */ +DATA(insert ( 4069 27 27 1 s 2799 3580 0 )); +DATA(insert ( 4069 27 27 2 s 2801 3580 0 )); +DATA(insert ( 4069 27 27 3 s 387 3580 0 )); +DATA(insert ( 4069 27 27 4 s 2802 3580 0 )); +DATA(insert ( 4069 27 27 5 s 2800 3580 0 )); +/* minmax real */ +DATA(insert ( 4070 700 700 1 s 622 3580 0 )); +DATA(insert ( 4070 700 700 2 s 624 3580 0 )); +DATA(insert ( 4070 700 700 3 s 620 3580 0 )); +DATA(insert ( 4070 700 700 4 s 625 3580 0 )); +DATA(insert ( 4070 700 700 5 s 623 3580 0 )); +/* minmax double precision */ +DATA(insert ( 4071 701 701 1 s 672 3580 0 )); +DATA(insert ( 4071 701 701 2 s 673 3580 0 )); +DATA(insert ( 4071 701 701 3 s 670 3580 0 )); +DATA(insert ( 4071 701 701 4 s 675 3580 0 )); +DATA(insert ( 4071 701 701 5 s 674 3580 0 )); +/* minmax abstime */ +DATA(insert ( 4072 702 702 1 s 562 3580 0 )); +DATA(insert ( 4072 702 702 2 s 564 3580 0 )); +DATA(insert ( 4072 702 702 3 s 560 3580 0 )); +DATA(insert ( 4072 702 702 4 s 565 3580 0 )); +DATA(insert ( 4072 702 702 5 s 563 3580 0 )); +/* minmax reltime */ +DATA(insert ( 4073 703 703 1 s 568 3580 0 )); +DATA(insert ( 4073 703 703 2 s 570 3580 0 )); +DATA(insert ( 4073 703 703 3 s 566 3580 0 )); +DATA(insert ( 4073 703 703 4 s 571 
3580 0 )); +DATA(insert ( 4073 703 703 5 s 569 3580 0 )); +/* minmax macaddr */ +DATA(insert ( 4074 829 829 1 s 1222 3580 0 )); +DATA(insert ( 4074 829 829 2 s 1223 3580 0 )); +DATA(insert ( 4074 829 829 3 s 1220 3580 0 )); +DATA(insert ( 4074 829 829 4 s 1225 3580 0 )); +DATA(insert ( 4074 829 829 5 s 1224 3580 0 )); +/* minmax inet */ +DATA(insert ( 4075 869 869 1 s 1203 3580 0 )); +DATA(insert ( 4075 869 869 2 s 1204 3580 0 )); +DATA(insert ( 4075 869 869 3 s 1201 3580 0 )); +DATA(insert ( 4075 869 869 4 s 1206 3580 0 )); +DATA(insert ( 4075 869 869 5 s 1205 3580 0 )); +/* minmax character */ +DATA(insert ( 4076 1042 1042 1 s 1058 3580 0 )); +DATA(insert ( 4076 1042 1042 2 s 1059 3580 0 )); +DATA(insert ( 4076 1042 1042 3 s 1054 3580 0 )); +DATA(insert ( 4076 1042 1042 4 s 1061 3580 0 )); +DATA(insert ( 4076 1042 1042 5 s 1060 3580 0 )); +/* minmax date */ +DATA(insert ( 4061 1082 1082 1 s 1095 3580 0 )); +DATA(insert ( 4061 1082 1082 2 s 1096 3580 0 )); +DATA(insert ( 4061 1082 1082 3 s 1093 3580 0 )); +DATA(insert ( 4061 1082 1082 4 s 1098 3580 0 )); +DATA(insert ( 4061 1082 1082 5 s 1097 3580 0 )); +/* minmax time without time zone */ +DATA(insert ( 4077 1083 1083 1 s 1110 3580 0 )); +DATA(insert ( 4077 1083 1083 2 s 1111 3580 0 )); +DATA(insert ( 4077 1083 1083 3 s 1108 3580 0 )); +DATA(insert ( 4077 1083 1083 4 s 1113 3580 0 )); +DATA(insert ( 4077 1083 1083 5 s 1112 3580 0 )); +/* minmax timestamp without time zone */ +DATA(insert ( 4059 1114 1114 1 s 2062 3580 0 )); +DATA(insert ( 4059 1114 1114 2 s 2063 3580 0 )); +DATA(insert ( 4059 1114 1114 3 s 2060 3580 0 )); +DATA(insert ( 4059 1114 1114 4 s 2065 3580 0 )); +DATA(insert ( 4059 1114 1114 5 s 2064 3580 0 )); +/* minmax timestamp with time zone */ +DATA(insert ( 4060 1184 1184 1 s 1322 3580 0 )); +DATA(insert ( 4060 1184 1184 2 s 1323 3580 0 )); +DATA(insert ( 4060 1184 1184 3 s 1320 3580 0 )); +DATA(insert ( 4060 1184 1184 4 s 1325 3580 0 )); +DATA(insert ( 4060 1184 1184 5 s 1324 3580 0 )); +/* 
minmax interval */ +DATA(insert ( 4078 1186 1186 1 s 1332 3580 0 )); +DATA(insert ( 4078 1186 1186 2 s 1333 3580 0 )); +DATA(insert ( 4078 1186 1186 3 s 1330 3580 0 )); +DATA(insert ( 4078 1186 1186 4 s 1335 3580 0 )); +DATA(insert ( 4078 1186 1186 5 s 1334 3580 0 )); +/* minmax time with time zone */ +DATA(insert ( 4058 1266 1266 1 s 1552 3580 0 )); +DATA(insert ( 4058 1266 1266 2 s 1553 3580 0 )); +DATA(insert ( 4058 1266 1266 3 s 1550 3580 0 )); +DATA(insert ( 4058 1266 1266 4 s 1555 3580 0 )); +DATA(insert ( 4058 1266 1266 5 s 1554 3580 0 )); +/* minmax bit */ +DATA(insert ( 4079 1560 1560 1 s 1786 3580 0 )); +DATA(insert ( 4079 1560 1560 2 s 1788 3580 0 )); +DATA(insert ( 4079 1560 1560 3 s 1784 3580 0 )); +DATA(insert ( 4079 1560 1560 4 s 1789 3580 0 )); +DATA(insert ( 4079 1560 1560 5 s 1787 3580 0 )); +/* minmax bit varying */ +DATA(insert ( 4080 1562 1562 1 s 1806 3580 0 )); +DATA(insert ( 4080 1562 1562 2 s 1808 3580 0 )); +DATA(insert ( 4080 1562 1562 3 s 1804 3580 0 )); +DATA(insert ( 4080 1562 1562 4 s 1809 3580 0 )); +DATA(insert ( 4080 1562 1562 5 s 1807 3580 0 )); +/* minmax numeric */ +DATA(insert ( 4055 1700 1700 1 s 1754 3580 0 )); +DATA(insert ( 4055 1700 1700 2 s 1755 3580 0 )); +DATA(insert ( 4055 1700 1700 3 s 1752 3580 0 )); +DATA(insert ( 4055 1700 1700 4 s 1757 3580 0 )); +DATA(insert ( 4055 1700 1700 5 s 1756 3580 0 )); +/* minmax uuid */ +DATA(insert ( 4081 2950 2950 1 s 2974 3580 0 )); +DATA(insert ( 4081 2950 2950 2 s 2976 3580 0 )); +DATA(insert ( 4081 2950 2950 3 s 2972 3580 0 )); +DATA(insert ( 4081 2950 2950 4 s 2977 3580 0 )); +DATA(insert ( 4081 2950 2950 5 s 2975 3580 0 )); +/* minmax pg_lsn */ +DATA(insert ( 4082 3220 3220 1 s 3224 3580 0 )); +DATA(insert ( 4082 3220 3220 2 s 3226 3580 0 )); +DATA(insert ( 4082 3220 3220 3 s 3222 3580 0 )); +DATA(insert ( 4082 3220 3220 4 s 3227 3580 0 )); +DATA(insert ( 4082 3220 3220 5 s 3225 3580 0 )); + #endif /* PG_AMOP_H */ diff --git a/src/include/catalog/pg_amproc.h 
b/src/include/catalog/pg_amproc.h index a1de3363e6f..e09f5578d79 100644 --- a/src/include/catalog/pg_amproc.h +++ b/src/include/catalog/pg_amproc.h @@ -432,4 +432,249 @@ DATA(insert ( 4017 25 25 3 4029 )); DATA(insert ( 4017 25 25 4 4030 )); DATA(insert ( 4017 25 25 5 4031 )); +/* BRIN opclasses */ +/* minmax bytea */ +DATA(insert ( 4064 17 17 1 3383 )); +DATA(insert ( 4064 17 17 2 3384 )); +DATA(insert ( 4064 17 17 3 3385 )); +DATA(insert ( 4064 17 17 4 3386 )); +DATA(insert ( 4064 17 17 11 1949 )); +DATA(insert ( 4064 17 17 12 1950 )); +DATA(insert ( 4064 17 17 13 1952 )); +DATA(insert ( 4064 17 17 14 1951 )); +/* minmax "char" */ +DATA(insert ( 4062 18 18 1 3383 )); +DATA(insert ( 4062 18 18 2 3384 )); +DATA(insert ( 4062 18 18 3 3385 )); +DATA(insert ( 4062 18 18 4 3386 )); +DATA(insert ( 4062 18 18 11 1246 )); +DATA(insert ( 4062 18 18 12 72 )); +DATA(insert ( 4062 18 18 13 74 )); +DATA(insert ( 4062 18 18 14 73 )); +/* minmax name */ +DATA(insert ( 4065 19 19 1 3383 )); +DATA(insert ( 4065 19 19 2 3384 )); +DATA(insert ( 4065 19 19 3 3385 )); +DATA(insert ( 4065 19 19 4 3386 )); +DATA(insert ( 4065 19 19 11 655 )); +DATA(insert ( 4065 19 19 12 656 )); +DATA(insert ( 4065 19 19 13 658 )); +DATA(insert ( 4065 19 19 14 657 )); +/* minmax bigint */ +DATA(insert ( 4063 20 20 1 3383 )); +DATA(insert ( 4063 20 20 2 3384 )); +DATA(insert ( 4063 20 20 3 3385 )); +DATA(insert ( 4063 20 20 4 3386 )); +DATA(insert ( 4063 20 20 11 469 )); +DATA(insert ( 4063 20 20 12 471 )); +DATA(insert ( 4063 20 20 13 472 )); +DATA(insert ( 4063 20 20 14 470 )); +/* minmax smallint */ +DATA(insert ( 4067 21 21 1 3383 )); +DATA(insert ( 4067 21 21 2 3384 )); +DATA(insert ( 4067 21 21 3 3385 )); +DATA(insert ( 4067 21 21 4 3386 )); +DATA(insert ( 4067 21 21 11 64 )); +DATA(insert ( 4067 21 21 12 148 )); +DATA(insert ( 4067 21 21 13 151 )); +DATA(insert ( 4067 21 21 14 146 )); +/* minmax integer */ +DATA(insert ( 4054 23 23 1 3383 )); +DATA(insert ( 4054 23 23 2 3384 )); +DATA(insert ( 
4054 23 23 3 3385 )); +DATA(insert ( 4054 23 23 4 3386 )); +DATA(insert ( 4054 23 23 11 66 )); +DATA(insert ( 4054 23 23 12 149 )); +DATA(insert ( 4054 23 23 13 150 )); +DATA(insert ( 4054 23 23 14 147 )); +/* minmax text */ +DATA(insert ( 4056 25 25 1 3383 )); +DATA(insert ( 4056 25 25 2 3384 )); +DATA(insert ( 4056 25 25 3 3385 )); +DATA(insert ( 4056 25 25 4 3386 )); +DATA(insert ( 4056 25 25 11 740 )); +DATA(insert ( 4056 25 25 12 741 )); +DATA(insert ( 4056 25 25 13 743 )); +DATA(insert ( 4056 25 25 14 742 )); +/* minmax oid */ +DATA(insert ( 4068 26 26 1 3383 )); +DATA(insert ( 4068 26 26 2 3384 )); +DATA(insert ( 4068 26 26 3 3385 )); +DATA(insert ( 4068 26 26 4 3386 )); +DATA(insert ( 4068 26 26 11 716 )); +DATA(insert ( 4068 26 26 12 717 )); +DATA(insert ( 4068 26 26 13 1639 )); +DATA(insert ( 4068 26 26 14 1638 )); +/* minmax tid */ +DATA(insert ( 4069 27 27 1 3383 )); +DATA(insert ( 4069 27 27 2 3384 )); +DATA(insert ( 4069 27 27 3 3385 )); +DATA(insert ( 4069 27 27 4 3386 )); +DATA(insert ( 4069 27 27 11 2791 )); +DATA(insert ( 4069 27 27 12 2793 )); +DATA(insert ( 4069 27 27 13 2792 )); +DATA(insert ( 4069 27 27 14 2790 )); +/* minmax real */ +DATA(insert ( 4070 700 700 1 3383 )); +DATA(insert ( 4070 700 700 2 3384 )); +DATA(insert ( 4070 700 700 3 3385 )); +DATA(insert ( 4070 700 700 4 3386 )); +DATA(insert ( 4070 700 700 11 289 )); +DATA(insert ( 4070 700 700 12 290 )); +DATA(insert ( 4070 700 700 13 292 )); +DATA(insert ( 4070 700 700 14 291 )); +/* minmax double precision */ +DATA(insert ( 4071 701 701 1 3383 )); +DATA(insert ( 4071 701 701 2 3384 )); +DATA(insert ( 4071 701 701 3 3385 )); +DATA(insert ( 4071 701 701 4 3386 )); +DATA(insert ( 4071 701 701 11 295 )); +DATA(insert ( 4071 701 701 12 296 )); +DATA(insert ( 4071 701 701 13 298 )); +DATA(insert ( 4071 701 701 14 297 )); +/* minmax abstime */ +DATA(insert ( 4072 702 702 1 3383 )); +DATA(insert ( 4072 702 702 2 3384 )); +DATA(insert ( 4072 702 702 3 3385 )); +DATA(insert ( 4072 702 702 4 
3386 )); +DATA(insert ( 4072 702 702 11 253 )); +DATA(insert ( 4072 702 702 12 255 )); +DATA(insert ( 4072 702 702 13 256 )); +DATA(insert ( 4072 702 702 14 254 )); +/* minmax reltime */ +DATA(insert ( 4073 703 703 1 3383 )); +DATA(insert ( 4073 703 703 2 3384 )); +DATA(insert ( 4073 703 703 3 3385 )); +DATA(insert ( 4073 703 703 4 3386 )); +DATA(insert ( 4073 703 703 11 259 )); +DATA(insert ( 4073 703 703 12 261 )); +DATA(insert ( 4073 703 703 13 262 )); +DATA(insert ( 4073 703 703 14 260 )); +/* minmax macaddr */ +DATA(insert ( 4074 829 829 1 3383 )); +DATA(insert ( 4074 829 829 2 3384 )); +DATA(insert ( 4074 829 829 3 3385 )); +DATA(insert ( 4074 829 829 4 3386 )); +DATA(insert ( 4074 829 829 11 831 )); +DATA(insert ( 4074 829 829 12 832 )); +DATA(insert ( 4074 829 829 13 834 )); +DATA(insert ( 4074 829 829 14 833 )); +/* minmax inet */ +DATA(insert ( 4075 869 869 1 3383 )); +DATA(insert ( 4075 869 869 2 3384 )); +DATA(insert ( 4075 869 869 3 3385 )); +DATA(insert ( 4075 869 869 4 3386 )); +DATA(insert ( 4075 869 869 11 921 )); +DATA(insert ( 4075 869 869 12 922 )); +DATA(insert ( 4075 869 869 13 924 )); +DATA(insert ( 4075 869 869 14 923 )); +/* minmax character */ +DATA(insert ( 4076 1042 1042 1 3383 )); +DATA(insert ( 4076 1042 1042 2 3384 )); +DATA(insert ( 4076 1042 1042 3 3385 )); +DATA(insert ( 4076 1042 1042 4 3386 )); +DATA(insert ( 4076 1042 1042 11 1049 )); +DATA(insert ( 4076 1042 1042 12 1050 )); +DATA(insert ( 4076 1042 1042 13 1052 )); +DATA(insert ( 4076 1042 1042 14 1051 )); +/* minmax date */ +DATA(insert ( 4061 1082 1082 1 3383 )); +DATA(insert ( 4061 1082 1082 2 3384 )); +DATA(insert ( 4061 1082 1082 3 3385 )); +DATA(insert ( 4061 1082 1082 4 3386 )); +DATA(insert ( 4061 1082 1082 11 1087 )); +DATA(insert ( 4061 1082 1082 12 1088 )); +DATA(insert ( 4061 1082 1082 13 1090 )); +DATA(insert ( 4061 1082 1082 14 1089 )); +/* minmax time without time zone */ +DATA(insert ( 4077 1083 1083 1 3383 )); +DATA(insert ( 4077 1083 1083 2 3384 )); 
+DATA(insert ( 4077 1083 1083 3 3385 )); +DATA(insert ( 4077 1083 1083 4 3386 )); +DATA(insert ( 4077 1083 1083 11 1102 )); +DATA(insert ( 4077 1083 1083 12 1103 )); +DATA(insert ( 4077 1083 1083 13 1105 )); +DATA(insert ( 4077 1083 1083 14 1104 )); +/* minmax timestamp without time zone */ +DATA(insert ( 4059 1114 1114 1 3383 )); +DATA(insert ( 4059 1114 1114 2 3384 )); +DATA(insert ( 4059 1114 1114 3 3385 )); +DATA(insert ( 4059 1114 1114 4 3386 )); +DATA(insert ( 4059 1114 1114 11 2054 )); +DATA(insert ( 4059 1114 1114 12 2055 )); +DATA(insert ( 4059 1114 1114 13 2056 )); +DATA(insert ( 4059 1114 1114 14 2057 )); +/* minmax timestamp with time zone */ +DATA(insert ( 4060 1184 1184 1 3383 )); +DATA(insert ( 4060 1184 1184 2 3384 )); +DATA(insert ( 4060 1184 1184 3 3385 )); +DATA(insert ( 4060 1184 1184 4 3386 )); +DATA(insert ( 4060 1184 1184 11 1154 )); +DATA(insert ( 4060 1184 1184 12 1155 )); +DATA(insert ( 4060 1184 1184 13 1156 )); +DATA(insert ( 4060 1184 1184 14 1157 )); +/* minmax interval */ +DATA(insert ( 4078 1186 1186 1 3383 )); +DATA(insert ( 4078 1186 1186 2 3384 )); +DATA(insert ( 4078 1186 1186 3 3385 )); +DATA(insert ( 4078 1186 1186 4 3386 )); +DATA(insert ( 4078 1186 1186 11 1164 )); +DATA(insert ( 4078 1186 1186 12 1165 )); +DATA(insert ( 4078 1186 1186 13 1166 )); +DATA(insert ( 4078 1186 1186 14 1167 )); +/* minmax time with time zone */ +DATA(insert ( 4058 1266 1266 1 3383 )); +DATA(insert ( 4058 1266 1266 2 3384 )); +DATA(insert ( 4058 1266 1266 3 3385 )); +DATA(insert ( 4058 1266 1266 4 3386 )); +DATA(insert ( 4058 1266 1266 11 1354 )); +DATA(insert ( 4058 1266 1266 12 1355 )); +DATA(insert ( 4058 1266 1266 13 1356 )); +DATA(insert ( 4058 1266 1266 14 1357 )); +/* minmax bit */ +DATA(insert ( 4079 1560 1560 1 3383 )); +DATA(insert ( 4079 1560 1560 2 3384 )); +DATA(insert ( 4079 1560 1560 3 3385 )); +DATA(insert ( 4079 1560 1560 4 3386 )); +DATA(insert ( 4079 1560 1560 11 1595 )); +DATA(insert ( 4079 1560 1560 12 1594 )); +DATA(insert ( 
4079 1560 1560 13 1592 )); +DATA(insert ( 4079 1560 1560 14 1593 )); +/* minmax bit varying */ +DATA(insert ( 4080 1562 1562 1 3383 )); +DATA(insert ( 4080 1562 1562 2 3384 )); +DATA(insert ( 4080 1562 1562 3 3385 )); +DATA(insert ( 4080 1562 1562 4 3386 )); +DATA(insert ( 4080 1562 1562 11 1671 )); +DATA(insert ( 4080 1562 1562 12 1670 )); +DATA(insert ( 4080 1562 1562 13 1668 )); +DATA(insert ( 4080 1562 1562 14 1669 )); +/* minmax numeric */ +DATA(insert ( 4055 1700 1700 1 3383 )); +DATA(insert ( 4055 1700 1700 2 3384 )); +DATA(insert ( 4055 1700 1700 3 3385 )); +DATA(insert ( 4055 1700 1700 4 3386 )); +DATA(insert ( 4055 1700 1700 11 1722 )); +DATA(insert ( 4055 1700 1700 12 1723 )); +DATA(insert ( 4055 1700 1700 13 1721 )); +DATA(insert ( 4055 1700 1700 14 1720 )); +/* minmax uuid */ +DATA(insert ( 4081 2950 2950 1 3383 )); +DATA(insert ( 4081 2950 2950 2 3384 )); +DATA(insert ( 4081 2950 2950 3 3385 )); +DATA(insert ( 4081 2950 2950 4 3386 )); +DATA(insert ( 4081 2950 2950 11 2954 )); +DATA(insert ( 4081 2950 2950 12 2955 )); +DATA(insert ( 4081 2950 2950 13 2957 )); +DATA(insert ( 4081 2950 2950 14 2958 )); +/* minmax pg_lsn */ +DATA(insert ( 4082 3220 3220 1 3383 )); +DATA(insert ( 4082 3220 3220 2 3384 )); +DATA(insert ( 4082 3220 3220 3 3385 )); +DATA(insert ( 4082 3220 3220 4 3386 )); +DATA(insert ( 4082 3220 3220 11 3231 )); +DATA(insert ( 4082 3220 3220 12 3232 )); +DATA(insert ( 4082 3220 3220 13 3234 )); +DATA(insert ( 4082 3220 3220 14 3235 )); + #endif /* PG_AMPROC_H */ diff --git a/src/include/catalog/pg_opclass.h b/src/include/catalog/pg_opclass.h index dc523416c92..595cd7f4879 100644 --- a/src/include/catalog/pg_opclass.h +++ b/src/include/catalog/pg_opclass.h @@ -236,4 +236,36 @@ DATA(insert ( 405 jsonb_ops PGNSP PGUID 4034 3802 t 0 )); DATA(insert ( 2742 jsonb_ops PGNSP PGUID 4036 3802 t 25 )); DATA(insert ( 2742 jsonb_path_ops PGNSP PGUID 4037 3802 f 23 )); +/* BRIN operator classes */ +/* no brin opclass for bool */ +DATA(insert ( 3580 
bytea_minmax_ops PGNSP PGUID 4064 17 t 0 )); +DATA(insert ( 3580 char_minmax_ops PGNSP PGUID 4062 18 t 0 )); +DATA(insert ( 3580 name_minmax_ops PGNSP PGUID 4065 19 t 0 )); +DATA(insert ( 3580 int8_minmax_ops PGNSP PGUID 4063 20 t 0 )); +DATA(insert ( 3580 int2_minmax_ops PGNSP PGUID 4067 21 t 0 )); +DATA(insert ( 3580 int4_minmax_ops PGNSP PGUID 4054 23 t 0 )); +DATA(insert ( 3580 text_minmax_ops PGNSP PGUID 4056 25 t 0 )); +DATA(insert ( 3580 oid_minmax_ops PGNSP PGUID 4068 26 t 0 )); +DATA(insert ( 3580 tid_minmax_ops PGNSP PGUID 4069 27 t 0 )); +DATA(insert ( 3580 float4_minmax_ops PGNSP PGUID 4070 700 t 0 )); +DATA(insert ( 3580 float8_minmax_ops PGNSP PGUID 4071 701 t 0 )); +DATA(insert ( 3580 abstime_minmax_ops PGNSP PGUID 4072 702 t 0 )); +DATA(insert ( 3580 reltime_minmax_ops PGNSP PGUID 4073 703 t 0 )); +DATA(insert ( 3580 macaddr_minmax_ops PGNSP PGUID 4074 829 t 0 )); +DATA(insert ( 3580 inet_minmax_ops PGNSP PGUID 4075 869 t 0 )); +DATA(insert ( 3580 bpchar_minmax_ops PGNSP PGUID 4076 1042 t 0 )); +DATA(insert ( 3580 date_minmax_ops PGNSP PGUID 4061 1082 t 0 )); +DATA(insert ( 3580 time_minmax_ops PGNSP PGUID 4077 1083 t 0 )); +DATA(insert ( 3580 timestamp_minmax_ops PGNSP PGUID 4059 1114 t 0 )); +DATA(insert ( 3580 timestamptz_minmax_ops PGNSP PGUID 4060 1184 t 0 )); +DATA(insert ( 3580 interval_minmax_ops PGNSP PGUID 4078 1186 t 0 )); +DATA(insert ( 3580 timetz_minmax_ops PGNSP PGUID 4058 1266 t 0 )); +DATA(insert ( 3580 bit_minmax_ops PGNSP PGUID 4079 1560 t 0 )); +DATA(insert ( 3580 varbit_minmax_ops PGNSP PGUID 4080 1562 t 0 )); +DATA(insert ( 3580 numeric_minmax_ops PGNSP PGUID 4055 1700 t 0 )); +/* no brin opclass for record, anyarray */ +DATA(insert ( 3580 uuid_minmax_ops PGNSP PGUID 4081 2950 t 0 )); +DATA(insert ( 3580 pg_lsn_minmax_ops PGNSP PGUID 4082 3220 t 0 )); +/* no brin opclass for enum, tsvector, tsquery, jsonb, range */ + #endif /* PG_OPCLASS_H */ diff --git a/src/include/catalog/pg_opfamily.h b/src/include/catalog/pg_opfamily.h 
index 26297ced0da..2d8af766025 100644 --- a/src/include/catalog/pg_opfamily.h +++ b/src/include/catalog/pg_opfamily.h @@ -157,4 +157,32 @@ DATA(insert OID = 4035 ( 783 jsonb_ops PGNSP PGUID )); DATA(insert OID = 4036 ( 2742 jsonb_ops PGNSP PGUID )); DATA(insert OID = 4037 ( 2742 jsonb_path_ops PGNSP PGUID )); +DATA(insert OID = 4054 ( 3580 int4_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4055 ( 3580 numeric_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4056 ( 3580 text_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4058 ( 3580 timetz_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4059 ( 3580 timestamp_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4060 ( 3580 timestamptz_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4061 ( 3580 date_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4062 ( 3580 char_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4063 ( 3580 int8_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4064 ( 3580 bytea_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4065 ( 3580 name_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4067 ( 3580 int2_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4068 ( 3580 oid_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4069 ( 3580 tid_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4070 ( 3580 float4_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4071 ( 3580 float8_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4072 ( 3580 abstime_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4073 ( 3580 reltime_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4074 ( 3580 macaddr_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4075 ( 3580 inet_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4076 ( 3580 bpchar_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4077 ( 3580 time_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4078 ( 3580 interval_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4079 ( 3580 bit_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4080 ( 3580 varbit_minmax_ops PGNSP PGUID )); +DATA(insert OID = 4081 ( 3580 uuid_minmax_ops PGNSP PGUID )); +DATA(insert 
OID = 4082 ( 3580 pg_lsn_minmax_ops PGNSP PGUID )); + #endif /* PG_OPFAMILY_H */ diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index b6dc1b82adb..497e652674b 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -565,6 +565,35 @@ DESCR("btree(internal)"); DATA(insert OID = 2785 ( btoptions PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ btoptions _null_ _null_ _null_ )); DESCR("btree(internal)"); +DATA(insert OID = 3789 ( bringetbitmap PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 20 "2281 2281" _null_ _null_ _null_ _null_ bringetbitmap _null_ _null_ _null_ )); +DESCR("brin(internal)"); +DATA(insert OID = 3790 ( brininsert PGNSP PGUID 12 1 0 0 0 f f f f t f v 6 0 16 "2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ brininsert _null_ _null_ _null_ )); +DESCR("brin(internal)"); +DATA(insert OID = 3791 ( brinbeginscan PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ brinbeginscan _null_ _null_ _null_ )); +DESCR("brin(internal)"); +DATA(insert OID = 3792 ( brinrescan PGNSP PGUID 12 1 0 0 0 f f f f t f v 5 0 2278 "2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ brinrescan _null_ _null_ _null_ )); +DESCR("brin(internal)"); +DATA(insert OID = 3793 ( brinendscan PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinendscan _null_ _null_ _null_ )); +DESCR("brin(internal)"); +DATA(insert OID = 3794 ( brinmarkpos PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinmarkpos _null_ _null_ _null_ )); +DESCR("brin(internal)"); +DATA(insert OID = 3795 ( brinrestrpos PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinrestrpos _null_ _null_ _null_ )); +DESCR("brin(internal)"); +DATA(insert OID = 3796 ( brinbuild PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ brinbuild _null_ _null_ _null_ )); 
+DESCR("brin(internal)"); +DATA(insert OID = 3797 ( brinbuildempty PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinbuildempty _null_ _null_ _null_ )); +DESCR("brin(internal)"); +DATA(insert OID = 3798 ( brinbulkdelete PGNSP PGUID 12 1 0 0 0 f f f f t f v 4 0 2281 "2281 2281 2281 2281" _null_ _null_ _null_ _null_ brinbulkdelete _null_ _null_ _null_ )); +DESCR("brin(internal)"); +DATA(insert OID = 3799 ( brinvacuumcleanup PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 2281 "2281 2281" _null_ _null_ _null_ _null_ brinvacuumcleanup _null_ _null_ _null_ )); +DESCR("brin(internal)"); +DATA(insert OID = 3800 ( brincostestimate PGNSP PGUID 12 1 0 0 0 f f f f t f v 7 0 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ brincostestimate _null_ _null_ _null_ )); +DESCR("brin(internal)"); +DATA(insert OID = 3801 ( brinoptions PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ brinoptions _null_ _null_ _null_ )); +DESCR("brin(internal)"); +DATA(insert OID = 3952 ( brin_summarize_new_values PGNSP PGUID 12 1 0 0 0 f f f f f f v 1 0 23 "2205" _null_ _null_ _null_ _null_ brin_summarize_new_values _null_ _null_ _null_ )); +DESCR("brin: standalone scan new table pages"); + DATA(insert OID = 339 ( poly_same PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_same _null_ _null_ _null_ )); DATA(insert OID = 340 ( poly_contain PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_contain _null_ _null_ _null_ )); DATA(insert OID = 341 ( poly_left PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_left _null_ _null_ _null_ )); @@ -4078,6 +4107,16 @@ DATA(insert OID = 2747 ( arrayoverlap PGNSP PGUID 12 1 0 0 0 f f f f t f i DATA(insert OID = 2748 ( arraycontains PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "2277 2277" _null_ _null_ _null_ _null_ arraycontains _null_ _null_ _null_ )); DATA(insert OID = 2749 ( 
arraycontained PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "2277 2277" _null_ _null_ _null_ _null_ arraycontained _null_ _null_ _null_ )); +/* BRIN minmax */ +DATA(insert OID = 3383 ( brin_minmax_opcinfo PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 2281 "2281" _null_ _null_ _null_ _null_ minmaxOpcInfo _null_ _null_ _null_ )); +DESCR("BRIN minmax support"); +DATA(insert OID = 3384 ( brin_minmax_add_value PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 16 "2281 2281 2281 2281" _null_ _null_ _null_ _null_ minmaxAddValue _null_ _null_ _null_ )); +DESCR("BRIN minmax support"); +DATA(insert OID = 3385 ( brin_minmax_consistent PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 0 16 "2281 2281 2281" _null_ _null_ _null_ _null_ minmaxConsistent _null_ _null_ _null_ )); +DESCR("BRIN minmax support"); +DATA(insert OID = 3386 ( brin_minmax_union PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 0 16 "2281 2281 2281" _null_ _null_ _null_ _null_ minmaxUnion _null_ _null_ _null_ )); +DESCR("BRIN minmax support"); + /* userlock replacements */ DATA(insert OID = 2880 ( pg_advisory_lock PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "20" _null_ _null_ _null_ _null_ pg_advisory_lock_int8 _null_ _null_ _null_ )); DESCR("obtain exclusive advisory lock"); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index d96e375f3f5..db7075f387b 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -403,6 +403,8 @@ extern Size PageGetExactFreeSpace(Page page); extern Size PageGetHeapFreeSpace(Page page); extern void PageIndexTupleDelete(Page page, OffsetNumber offset); extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems); +extern void PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos, + int nitems); extern char *PageSetChecksumCopy(Page page, BlockNumber blkno); extern void PageSetChecksumInplace(Page page, BlockNumber blkno); diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 0f662ec8bb4..25cb3fa85fe 100644 --- 
a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -190,6 +190,7 @@ extern double estimate_num_groups(PlannerInfo *root, List *groupExprs, extern Selectivity estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets); +extern Datum brincostestimate(PG_FUNCTION_ARGS); extern Datum btcostestimate(PG_FUNCTION_ARGS); extern Datum hashcostestimate(PG_FUNCTION_ARGS); extern Datum gistcostestimate(PG_FUNCTION_ARGS); diff --git a/src/test/regress/expected/brin.out b/src/test/regress/expected/brin.out new file mode 100644 index 00000000000..f8be27e5e30 --- /dev/null +++ b/src/test/regress/expected/brin.out @@ -0,0 +1,179 @@ +SET synchronous_commit = 0; +CREATE TABLE brintest (byteacol bytea, + charcol "char", + namecol name, + int8col bigint, + int2col smallint, + int4col integer, + textcol text, + oidcol oid, + tidcol tid, + float4col real, + float8col double precision, + macaddrcol macaddr, + inetcol inet, + bpcharcol character, + datecol date, + timecol time without time zone, + timestampcol timestamp without time zone, + timestamptzcol timestamp with time zone, + intervalcol interval, + timetzcol time with time zone, + bitcol bit(10), + varbitcol bit varying(16), + numericcol numeric, + uuidcol uuid, + lsncol pg_lsn +) WITH (fillfactor=50); +INSERT INTO brintest SELECT + repeat(stringu1, 42)::bytea, + substr(stringu1, 1, 1)::"char", + stringu1::name, 142857 * tenthous, + thousand, + twothousand, + repeat(stringu1, 42), + unique1::oid, + format('(%s,%s)', tenthous, twenty)::tid, + (four + 1.0)/(hundred+1), + odd::float8 / (tenthous + 1), + format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr, + inet '10.2.3.4' + tenthous, + substr(stringu1, 1, 1)::bpchar, + date '1995-08-15' + tenthous, + time '01:20:30' + thousand * interval '18.5 second', + timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours', + timestamptz '1972-10-10 03:00' + thousand * interval '1 hour', + 
justify_days(justify_hours(tenthous * interval '12 minutes')), + timetz '01:30:20' + hundred * interval '15 seconds', + thousand::bit(10), + tenthous::bit(16)::varbit, + tenthous::numeric(36,30) * fivethous * even / (hundred + 1), + format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid, + format('%s/%s%s', odd, even, tenthous)::pg_lsn +FROM tenk1; +CREATE INDEX brinidx ON brintest USING brin ( + byteacol, + charcol, + namecol, + int8col, + int2col, + int4col, + textcol, + oidcol, + tidcol, + float4col, + float8col, + macaddrcol, + inetcol, + bpcharcol, + datecol, + timecol, + timestampcol, + timestamptzcol, + intervalcol, + timetzcol, + bitcol, + varbitcol, + numericcol, + uuidcol, + lsncol +) with (pages_per_range = 1); +CREATE TABLE brinopers (colname name, op text[], value text[], + check (cardinality(op) = cardinality(value))); +INSERT INTO brinopers VALUES ('byteacol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}'); +INSERT INTO brinopers VALUES ('charcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}'); +INSERT INTO brinopers VALUES ('namecol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}'); +INSERT INTO brinopers VALUES ('int8col', '{>, >=, =, <=, <}', '{1428427143, 1428427143, 0, 0, 0}'); +INSERT INTO brinopers VALUES ('int2col', '{>, >=, =, <=, <}', '{999, 999, 0, 0, 0}'); +INSERT INTO brinopers VALUES ('int4col', '{>, >=, =, <=, <}', '{1999, 1999, 0, 0, 0}'); +INSERT INTO brinopers VALUES ('textcol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAA, AAAAA, AAAAA}'); +INSERT INTO brinopers VALUES ('oidcol', '{>, >=, =, <=, <}', '{9999, 9999, 0, 0, 0}'); +INSERT INTO brinopers VALUES ('tidcol', '{>, >=, =, <=, <}', '{"(9999,19)", "(9999,19)", "(0,0)", "(0,0)", "(0,0)"}'); +INSERT INTO brinopers VALUES ('float4col', '{>, 
>=, =, <=, <}', '{1, 1, 0.0103093, 0.0103093, 0.0103093}'); +INSERT INTO brinopers VALUES ('float8col', '{>, >=, =, <=, <}', '{1.98, 1.98, 0, 0, 0}'); +INSERT INTO brinopers VALUES ('inetcol', '{>, >=, =, <=, <}', '{10.2.42.19, 10.2.42.19, 10.2.3.4, 10.2.3.4, 10.2.3.4}'); +INSERT INTO brinopers VALUES ('bpcharcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}'); +INSERT INTO brinopers VALUES ('datecol', '{>, >=, =, <=, <}', '{2022-12-30, 2022-12-30, 1995-08-15, 1995-08-15, 1995-08-15}'); +INSERT INTO brinopers VALUES ('timecol', '{>, >=, =, <=, <}', '{06:28:31.5, 06:28:31.5, 01:20:30, 01:20:30, 01:20:30}'); +INSERT INTO brinopers VALUES ('timestampcol', '{>, >=, =, <=, <}', '{1984-01-20 22:42:21, 1984-01-20 22:42:21, 1942-07-23 03:05:09, 1942-07-23 03:05:09, 1942-07-23 03:05:09}'); +INSERT INTO brinopers VALUES ('timestamptzcol', '{>, >=, =, <=, <}', '{1972-11-20 19:00:00-03, 1972-11-20 19:00:00-03, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04}'); +INSERT INTO brinopers VALUES ('intervalcol', '{>, >=, =, <=, <}', '{2 mons 23 days 07:48:00, 2 mons 23 days 07:48:00, 00:00:00, 00:00:00, 00:00:00}'); +INSERT INTO brinopers VALUES ('timetzcol', '{>, >=, =, <=, <}', '{01:55:05-03, 01:55:05-03, 01:30:20-03, 01:30:20-03, 01:30:20-03}'); +INSERT INTO brinopers VALUES ('numericcol', '{>, >=, =, <=, <}', '{99470151.9, 99470151.9, 0.00, 0.01, 0.01}'); +INSERT INTO brinopers VALUES ('macaddrcol', '{>, >=, =, <=, <}', '{ff:fe:00:00:00:00, ff:fe:00:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00}'); +INSERT INTO brinopers VALUES ('bitcol', '{>, >=, =, <=, <}', '{1111111000, 1111111000, 0000000010, 0000000010, 0000000010}'); +INSERT INTO brinopers VALUES ('varbitcol', '{>, >=, =, <=, <}', '{1111111111111000, 1111111111111000, 0000000000000100, 0000000000000100, 0000000000000100}'); +INSERT INTO brinopers VALUES ('uuidcol', '{>, >=, =, <=, <}', '{99989998-9998-9998-9998-999899989998, 99989998-9998-9998-9998-999899989998, 
00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040005}'); +INSERT INTO brinopers VALUES ('lsncol', '{>, >=, =, <=, <}', '{198/1999799, 198/1999799, 30/312815, 0/1200, 0/1200}'); +DO $x$ +DECLARE + r record; + tabname text; + tabname_ss text; + count int; + query text; + plan text; +BEGIN + FOR r IN SELECT row_number() OVER (), colname, oper, value[ordinality] FROM brinopers, unnest(op) WITH ORDINALITY AS oper LOOP + tabname := format('qry_%s', r.row_number); + tabname_ss := tabname || '_ss'; + query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$, + tabname, r.colname, r.oper, r.value); + -- run the query using the brin index + SET enable_seqscan = 0; + SET enable_bitmapscan = 1; + EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP*/', tabname); + EXECUTE query; + + -- run the query using a seqscan + SET enable_seqscan = 1; + SET enable_bitmapscan = 0; + query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$, + tabname_ss, r.colname, r.oper, r.value); + EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP */', tabname_ss); + EXECUTE query; + + -- make sure both return the same results + EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname, tabname_ss); + GET DIAGNOSTICS count = ROW_COUNT; + IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF; + EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname_ss, tabname); + GET DIAGNOSTICS count = ROW_COUNT; + IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF; + end loop; +end; +$x$; +INSERT INTO brintest SELECT + repeat(stringu1, 42)::bytea, + substr(stringu1, 1, 1)::"char", + stringu1::name, 142857 * tenthous, + thousand, + twothousand, + repeat(stringu1, 42), + unique1::oid, + format('(%s,%s)', tenthous, twenty)::tid, + (four + 1.0)/(hundred+1), + odd::float8 / (tenthous + 1), + 
format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr, + inet '10.2.3.4' + tenthous, + substr(stringu1, 1, 1)::bpchar, + date '1995-08-15' + tenthous, + time '01:20:30' + thousand * interval '18.5 second', + timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours', + timestamptz '1972-10-10 03:00' + thousand * interval '1 hour', + justify_days(justify_hours(tenthous * interval '12 minutes')), + timetz '01:30:20' + hundred * interval '15 seconds', + thousand::bit(10), + tenthous::bit(16)::varbit, + tenthous::numeric(36,30) * fivethous * even / (hundred + 1), + format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid, + format('%s/%s%s', odd, even, tenthous)::pg_lsn +FROM tenk1; +SELECT brin_summarize_new_values('brinidx'::regclass); + brin_summarize_new_values +--------------------------- + 2000 +(1 row) + +UPDATE brintest SET int8col = int8col * int4col; +SET synchronous_commit = 1; diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out index 992522ea3f1..9870bfaa018 100644 --- a/src/test/regress/expected/opr_sanity.out +++ b/src/test/regress/expected/opr_sanity.out @@ -1658,6 +1658,11 @@ ORDER BY 1, 2, 3; 2742 | 9 | ? 2742 | 10 | ?| 2742 | 11 | ?& + 3580 | 1 | < + 3580 | 2 | <= + 3580 | 3 | = + 3580 | 4 | >= + 3580 | 5 | > 4000 | 1 | << 4000 | 1 | ~<~ 4000 | 2 | &< @@ -1680,7 +1685,7 @@ ORDER BY 1, 2, 3; 4000 | 15 | > 4000 | 16 | @> 4000 | 18 | = -(80 rows) +(85 rows) -- Check that all opclass search operators have selectivity estimators. -- This is not absolutely required, but it seems a reasonable thing @@ -1842,11 +1847,13 @@ WHERE NOT ( -- GIN has six support functions. 1-3 are mandatory, 5 is optional, and -- at least one of 4 and 6 must be given. 
-- SP-GiST has five support functions, all mandatory + -- BRIN has four mandatory support functions, and a bunch of optionals amname = 'btree' AND procnums @> '{1}' OR amname = 'hash' AND procnums = '{1}' OR amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR - amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' + amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR + amname = 'brin' AND procnums @> '{1, 2, 3, 4}' ); amname | opfname | amproclefttype | amprocrighttype | procnums --------+---------+----------------+-----------------+---------- @@ -1867,7 +1874,8 @@ WHERE NOT ( amname = 'hash' AND procnums = '{1}' OR amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR - amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' + amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR + amname = 'brin' AND procnums @> '{1, 2, 3, 4}' ); amname | opcname | procnums --------+---------+---------- diff --git a/src/test/regress/output/misc.source b/src/test/regress/output/misc.source index 7015bfda2c2..06606081ad3 100644 --- a/src/test/regress/output/misc.source +++ b/src/test/regress/output/misc.source @@ -591,6 +591,8 @@ SELECT user_relns() AS user_relns bb box_tbl bprime + brinopers + brintest bt_f8_heap bt_i4_heap bt_name_heap @@ -698,7 +700,7 @@ SELECT user_relns() AS user_relns tvvmv varchar_tbl xacttest -(120 rows) +(122 rows) SELECT name(equipment(hobby_construct(text 'skywalking', text 'mer'))); name diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 9902dbeb39c..d4f02e5703a 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -83,7 +83,7 @@ test: select_into select_distinct select_distinct_on select_implicit select_havi # ---------- # Another group of parallel tests # ---------- -test: privileges security_label collate matview lock 
replica_identity rowsecurity +test: brin privileges security_label collate matview lock replica_identity rowsecurity # ---------- # Another group of parallel tests diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 2902a05dfb6..b1e44b3bf30 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -106,6 +106,7 @@ test: alter_generic test: misc test: psql test: async +test: brin test: rules test: event_trigger test: select_views diff --git a/src/test/regress/sql/brin.sql b/src/test/regress/sql/brin.sql new file mode 100644 index 00000000000..244652f4c40 --- /dev/null +++ b/src/test/regress/sql/brin.sql @@ -0,0 +1,184 @@ +SET synchronous_commit = 0; + +CREATE TABLE brintest (byteacol bytea, + charcol "char", + namecol name, + int8col bigint, + int2col smallint, + int4col integer, + textcol text, + oidcol oid, + tidcol tid, + float4col real, + float8col double precision, + macaddrcol macaddr, + inetcol inet, + bpcharcol character, + datecol date, + timecol time without time zone, + timestampcol timestamp without time zone, + timestamptzcol timestamp with time zone, + intervalcol interval, + timetzcol time with time zone, + bitcol bit(10), + varbitcol bit varying(16), + numericcol numeric, + uuidcol uuid, + lsncol pg_lsn +) WITH (fillfactor=50); + +INSERT INTO brintest SELECT + repeat(stringu1, 42)::bytea, + substr(stringu1, 1, 1)::"char", + stringu1::name, 142857 * tenthous, + thousand, + twothousand, + repeat(stringu1, 42), + unique1::oid, + format('(%s,%s)', tenthous, twenty)::tid, + (four + 1.0)/(hundred+1), + odd::float8 / (tenthous + 1), + format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr, + inet '10.2.3.4' + tenthous, + substr(stringu1, 1, 1)::bpchar, + date '1995-08-15' + tenthous, + time '01:20:30' + thousand * interval '18.5 second', + timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours', + timestamptz '1972-10-10 03:00' + thousand * interval '1 
hour', + justify_days(justify_hours(tenthous * interval '12 minutes')), + timetz '01:30:20' + hundred * interval '15 seconds', + thousand::bit(10), + tenthous::bit(16)::varbit, + tenthous::numeric(36,30) * fivethous * even / (hundred + 1), + format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid, + format('%s/%s%s', odd, even, tenthous)::pg_lsn +FROM tenk1; + +CREATE INDEX brinidx ON brintest USING brin ( + byteacol, + charcol, + namecol, + int8col, + int2col, + int4col, + textcol, + oidcol, + tidcol, + float4col, + float8col, + macaddrcol, + inetcol, + bpcharcol, + datecol, + timecol, + timestampcol, + timestamptzcol, + intervalcol, + timetzcol, + bitcol, + varbitcol, + numericcol, + uuidcol, + lsncol +) with (pages_per_range = 1); + +CREATE TABLE brinopers (colname name, op text[], value text[], + check (cardinality(op) = cardinality(value))); + +INSERT INTO brinopers VALUES ('byteacol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}'); +INSERT INTO brinopers VALUES ('charcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}'); +INSERT INTO brinopers VALUES ('namecol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}'); +INSERT INTO brinopers VALUES ('int8col', '{>, >=, =, <=, <}', '{1428427143, 1428427143, 0, 0, 0}'); +INSERT INTO brinopers VALUES ('int2col', '{>, >=, =, <=, <}', '{999, 999, 0, 0, 0}'); +INSERT INTO brinopers VALUES ('int4col', '{>, >=, =, <=, <}', '{1999, 1999, 0, 0, 0}'); +INSERT INTO brinopers VALUES ('textcol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAA, AAAAA, AAAAA}'); +INSERT INTO brinopers VALUES ('oidcol', '{>, >=, =, <=, <}', '{9999, 9999, 0, 0, 0}'); +INSERT INTO brinopers VALUES ('tidcol', '{>, >=, =, <=, <}', '{"(9999,19)", "(9999,19)", "(0,0)", "(0,0)", "(0,0)"}'); +INSERT INTO brinopers VALUES 
('float4col', '{>, >=, =, <=, <}', '{1, 1, 0.0103093, 0.0103093, 0.0103093}'); +INSERT INTO brinopers VALUES ('float8col', '{>, >=, =, <=, <}', '{1.98, 1.98, 0, 0, 0}'); +INSERT INTO brinopers VALUES ('inetcol', '{>, >=, =, <=, <}', '{10.2.42.19, 10.2.42.19, 10.2.3.4, 10.2.3.4, 10.2.3.4}'); +INSERT INTO brinopers VALUES ('bpcharcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}'); +INSERT INTO brinopers VALUES ('datecol', '{>, >=, =, <=, <}', '{2022-12-30, 2022-12-30, 1995-08-15, 1995-08-15, 1995-08-15}'); +INSERT INTO brinopers VALUES ('timecol', '{>, >=, =, <=, <}', '{06:28:31.5, 06:28:31.5, 01:20:30, 01:20:30, 01:20:30}'); +INSERT INTO brinopers VALUES ('timestampcol', '{>, >=, =, <=, <}', '{1984-01-20 22:42:21, 1984-01-20 22:42:21, 1942-07-23 03:05:09, 1942-07-23 03:05:09, 1942-07-23 03:05:09}'); +INSERT INTO brinopers VALUES ('timestamptzcol', '{>, >=, =, <=, <}', '{1972-11-20 19:00:00-03, 1972-11-20 19:00:00-03, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04}'); +INSERT INTO brinopers VALUES ('intervalcol', '{>, >=, =, <=, <}', '{2 mons 23 days 07:48:00, 2 mons 23 days 07:48:00, 00:00:00, 00:00:00, 00:00:00}'); +INSERT INTO brinopers VALUES ('timetzcol', '{>, >=, =, <=, <}', '{01:55:05-03, 01:55:05-03, 01:30:20-03, 01:30:20-03, 01:30:20-03}'); +INSERT INTO brinopers VALUES ('numericcol', '{>, >=, =, <=, <}', '{99470151.9, 99470151.9, 0.00, 0.01, 0.01}'); +INSERT INTO brinopers VALUES ('macaddrcol', '{>, >=, =, <=, <}', '{ff:fe:00:00:00:00, ff:fe:00:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00}'); +INSERT INTO brinopers VALUES ('bitcol', '{>, >=, =, <=, <}', '{1111111000, 1111111000, 0000000010, 0000000010, 0000000010}'); +INSERT INTO brinopers VALUES ('varbitcol', '{>, >=, =, <=, <}', '{1111111111111000, 1111111111111000, 0000000000000100, 0000000000000100, 0000000000000100}'); +INSERT INTO brinopers VALUES ('uuidcol', '{>, >=, =, <=, <}', '{99989998-9998-9998-9998-999899989998, 99989998-9998-9998-9998-999899989998, 
00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040005}'); +INSERT INTO brinopers VALUES ('lsncol', '{>, >=, =, <=, <}', '{198/1999799, 198/1999799, 30/312815, 0/1200, 0/1200}'); + +DO $x$ +DECLARE + r record; + tabname text; + tabname_ss text; + count int; + query text; + plan text; +BEGIN + FOR r IN SELECT row_number() OVER (), colname, oper, value[ordinality] FROM brinopers, unnest(op) WITH ORDINALITY AS oper LOOP + tabname := format('qry_%s', r.row_number); + tabname_ss := tabname || '_ss'; + query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$, + tabname, r.colname, r.oper, r.value); + -- run the query using the brin index + SET enable_seqscan = 0; + SET enable_bitmapscan = 1; + EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP*/', tabname); + EXECUTE query; + + -- run the query using a seqscan + SET enable_seqscan = 1; + SET enable_bitmapscan = 0; + query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$, + tabname_ss, r.colname, r.oper, r.value); + EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP */', tabname_ss); + EXECUTE query; + + -- make sure both return the same results + EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname, tabname_ss); + GET DIAGNOSTICS count = ROW_COUNT; + IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF; + EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname_ss, tabname); + GET DIAGNOSTICS count = ROW_COUNT; + IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF; + end loop; +end; +$x$; + +INSERT INTO brintest SELECT + repeat(stringu1, 42)::bytea, + substr(stringu1, 1, 1)::"char", + stringu1::name, 142857 * tenthous, + thousand, + twothousand, + repeat(stringu1, 42), + unique1::oid, + format('(%s,%s)', tenthous, twenty)::tid, + (four + 1.0)/(hundred+1), + odd::float8 / (tenthous + 1), + 
format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr, + inet '10.2.3.4' + tenthous, + substr(stringu1, 1, 1)::bpchar, + date '1995-08-15' + tenthous, + time '01:20:30' + thousand * interval '18.5 second', + timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours', + timestamptz '1972-10-10 03:00' + thousand * interval '1 hour', + justify_days(justify_hours(tenthous * interval '12 minutes')), + timetz '01:30:20' + hundred * interval '15 seconds', + thousand::bit(10), + tenthous::bit(16)::varbit, + tenthous::numeric(36,30) * fivethous * even / (hundred + 1), + format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid, + format('%s/%s%s', odd, even, tenthous)::pg_lsn +FROM tenk1; + +SELECT brin_summarize_new_values('brinidx'::regclass); + +UPDATE brintest SET int8col = int8col * int4col; + +SET synchronous_commit = 1; diff --git a/src/test/regress/sql/opr_sanity.sql b/src/test/regress/sql/opr_sanity.sql index b394c300769..7159a8377ee 100644 --- a/src/test/regress/sql/opr_sanity.sql +++ b/src/test/regress/sql/opr_sanity.sql @@ -1195,11 +1195,13 @@ WHERE NOT ( -- GIN has six support functions. 1-3 are mandatory, 5 is optional, and -- at least one of 4 and 6 must be given. 
-- SP-GiST has five support functions, all mandatory + -- BRIN has four mandatory support functions, and a bunch of optionals amname = 'btree' AND procnums @> '{1}' OR amname = 'hash' AND procnums = '{1}' OR amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR - amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' + amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR + amname = 'brin' AND procnums @> '{1, 2, 3, 4}' ); -- Also, check if there are any pg_opclass entries that don't seem to have @@ -1218,7 +1220,8 @@ WHERE NOT ( amname = 'hash' AND procnums = '{1}' OR amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR - amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' + amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR + amname = 'brin' AND procnums @> '{1, 2, 3, 4}' ); -- Unfortunately, we can't check the amproc link very well because the |