aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/nbtree/nbtpage.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/nbtree/nbtpage.c')
-rw-r--r--src/backend/access/nbtree/nbtpage.c206
1 files changed, 141 insertions, 65 deletions
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 56041c3d383..37829d34321 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -33,7 +33,8 @@
#include "storage/predicate.h"
#include "utils/snapmgr.h"
-static void _bt_cachemetadata(Relation rel, BTMetaPageData *metad);
+static void _bt_cachemetadata(Relation rel, BTMetaPageData *input);
+static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
bool *rightsib_empty);
@@ -77,7 +78,9 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
}
/*
- * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to the new.
+ * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
+ * 3, the last version that can be updated without broadly affecting
+ * on-disk compatibility. (A REINDEX is required to upgrade to v4.)
*
* This routine does purely in-memory image upgrade. Caller is
* responsible for locking, WAL-logging etc.
@@ -93,11 +96,11 @@ _bt_upgrademetapage(Page page)
/* It must be really a meta page of upgradable version */
Assert(metaopaque->btpo_flags & BTP_META);
- Assert(metad->btm_version < BTREE_VERSION);
+ Assert(metad->btm_version < BTREE_NOVAC_VERSION);
Assert(metad->btm_version >= BTREE_MIN_VERSION);
/* Set version number and fill extra fields added into version 3 */
- metad->btm_version = BTREE_VERSION;
+ metad->btm_version = BTREE_NOVAC_VERSION;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
@@ -107,44 +110,80 @@ _bt_upgrademetapage(Page page)
}
/*
- * Cache metadata from meta page to rel->rd_amcache.
+ * Cache metadata from input meta page to rel->rd_amcache.
*/
static void
-_bt_cachemetadata(Relation rel, BTMetaPageData *metad)
+_bt_cachemetadata(Relation rel, BTMetaPageData *input)
{
+ BTMetaPageData *cached_metad;
+
/* We assume rel->rd_amcache was already freed by caller */
Assert(rel->rd_amcache == NULL);
rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
sizeof(BTMetaPageData));
- /*
- * Meta page should be of supported version (should be already checked by
- * caller).
- */
- Assert(metad->btm_version >= BTREE_MIN_VERSION &&
- metad->btm_version <= BTREE_VERSION);
+ /* Meta page should be of supported version */
+ Assert(input->btm_version >= BTREE_MIN_VERSION &&
+ input->btm_version <= BTREE_VERSION);
- if (metad->btm_version == BTREE_VERSION)
+ cached_metad = (BTMetaPageData *) rel->rd_amcache;
+ if (input->btm_version >= BTREE_NOVAC_VERSION)
{
- /* Last version of meta-data, no need to upgrade */
- memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+ /* Version with compatible meta-data, no need to upgrade */
+ memcpy(cached_metad, input, sizeof(BTMetaPageData));
}
else
{
- BTMetaPageData *cached_metad = (BTMetaPageData *) rel->rd_amcache;
-
/*
* Upgrade meta-data: copy available information from meta-page and
* fill new fields with default values.
+ *
+ * Note that we cannot upgrade to version 4+ without a REINDEX, since
+ * extensive on-disk changes are required.
*/
- memcpy(rel->rd_amcache, metad, offsetof(BTMetaPageData, btm_oldest_btpo_xact));
- cached_metad->btm_version = BTREE_VERSION;
+ memcpy(cached_metad, input, offsetof(BTMetaPageData, btm_oldest_btpo_xact));
+ cached_metad->btm_version = BTREE_NOVAC_VERSION;
cached_metad->btm_oldest_btpo_xact = InvalidTransactionId;
cached_metad->btm_last_cleanup_num_heap_tuples = -1.0;
}
}
/*
+ * Get metadata from share-locked buffer containing metapage, while performing
+ * standard sanity checks. Sanity checks here must match _bt_getroot().
+ */
+static BTMetaPageData *
+_bt_getmeta(Relation rel, Buffer metabuf)
+{
+ Page metapg;
+ BTPageOpaque metaopaque;
+ BTMetaPageData *metad;
+
+ metapg = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ metad = BTPageGetMeta(metapg);
+
+ /* sanity-check the metapage */
+ if (!P_ISMETA(metaopaque) ||
+ metad->btm_magic != BTREE_MAGIC)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" is not a btree",
+ RelationGetRelationName(rel))));
+
+ if (metad->btm_version < BTREE_MIN_VERSION ||
+ metad->btm_version > BTREE_VERSION)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("version mismatch in index \"%s\": file version %d, "
+ "current version %d, minimal supported version %d",
+ RelationGetRelationName(rel),
+ metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
+
+ return metad;
+}
+
+/*
* _bt_update_meta_cleanup_info() -- Update cleanup-related information in
* the metapage.
*
@@ -167,7 +206,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
metad = BTPageGetMeta(metapg);
/* outdated version of metapage always needs rewrite */
- if (metad->btm_version < BTREE_VERSION)
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
needsRewrite = true;
else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
@@ -186,7 +225,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
START_CRIT_SECTION();
/* upgrade meta-page if needed */
- if (metad->btm_version < BTREE_VERSION)
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
/* update cleanup-related information */
@@ -202,6 +241,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
XLogBeginInsert();
XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
md.root = metad->btm_root;
md.level = metad->btm_level;
md.fastroot = metad->btm_fastroot;
@@ -376,7 +417,7 @@ _bt_getroot(Relation rel, int access)
START_CRIT_SECTION();
/* upgrade metapage if needed */
- if (metad->btm_version < BTREE_VERSION)
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_root = rootblkno;
@@ -400,6 +441,8 @@ _bt_getroot(Relation rel, int access)
XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
md.root = rootblkno;
md.level = 0;
md.fastroot = rootblkno;
@@ -595,37 +638,12 @@ _bt_getrootheight(Relation rel)
{
BTMetaPageData *metad;
- /*
- * We can get what we need from the cached metapage data. If it's not
- * cached yet, load it. Sanity checks here must match _bt_getroot().
- */
if (rel->rd_amcache == NULL)
{
Buffer metabuf;
- Page metapg;
- BTPageOpaque metaopaque;
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
- metapg = BufferGetPage(metabuf);
- metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
- metad = BTPageGetMeta(metapg);
-
- /* sanity-check the metapage */
- if (!P_ISMETA(metaopaque) ||
- metad->btm_magic != BTREE_MAGIC)
- ereport(ERROR,
- (errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg("index \"%s\" is not a btree",
- RelationGetRelationName(rel))));
-
- if (metad->btm_version < BTREE_MIN_VERSION ||
- metad->btm_version > BTREE_VERSION)
- ereport(ERROR,
- (errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg("version mismatch in index \"%s\": file version %d, "
- "current version %d, minimal supported version %d",
- RelationGetRelationName(rel),
- metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
+ metad = _bt_getmeta(rel, metabuf);
/*
* If there's no root page yet, _bt_getroot() doesn't expect a cache
@@ -642,20 +660,71 @@ _bt_getrootheight(Relation rel)
* Cache the metapage data for next time
*/
_bt_cachemetadata(rel, metad);
-
+ /* We shouldn't have cached it if any of these fail */
+ Assert(metad->btm_magic == BTREE_MAGIC);
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ Assert(metad->btm_fastroot != P_NONE);
_bt_relbuf(rel, metabuf);
}
+ /* Get cached page */
metad = (BTMetaPageData *) rel->rd_amcache;
- /* We shouldn't have cached it if any of these fail */
- Assert(metad->btm_magic == BTREE_MAGIC);
- Assert(metad->btm_version == BTREE_VERSION);
- Assert(metad->btm_fastroot != P_NONE);
return metad->btm_fastlevel;
}
/*
+ * _bt_heapkeyspace() -- is heap TID being treated as a key?
+ *
+ * This is used to determine the rules that must be used to descend a
+ * btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
+ * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
+ * performance when inserting a new BTScanInsert-wise duplicate tuple
+ * among many leaf pages already full of such duplicates.
+ */
+bool
+_bt_heapkeyspace(Relation rel)
+{
+ BTMetaPageData *metad;
+
+ if (rel->rd_amcache == NULL)
+ {
+ Buffer metabuf;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metad = _bt_getmeta(rel, metabuf);
+
+ /*
+ * If there's no root page yet, _bt_getroot() doesn't expect a cache
+ * to be made, so just stop here. (XXX perhaps _bt_getroot() should
+ * be changed to allow this case.)
+ */
+ if (metad->btm_root == P_NONE)
+ {
+ uint32 btm_version = metad->btm_version;
+
+ _bt_relbuf(rel, metabuf);
+ return btm_version > BTREE_NOVAC_VERSION;
+ }
+
+ /*
+ * Cache the metapage data for next time
+ */
+ _bt_cachemetadata(rel, metad);
+ /* We shouldn't have cached it if any of these fail */
+ Assert(metad->btm_magic == BTREE_MAGIC);
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ Assert(metad->btm_fastroot != P_NONE);
+ _bt_relbuf(rel, metabuf);
+ }
+
+ /* Get cached page */
+ metad = (BTMetaPageData *) rel->rd_amcache;
+
+ return metad->btm_version > BTREE_NOVAC_VERSION;
+}
+
+/*
* _bt_checkpage() -- Verify that a freshly-read page looks sane.
*/
void
@@ -1123,11 +1192,12 @@ _bt_is_page_halfdead(Relation rel, BlockNumber blk)
* right sibling.
*
* "child" is the leaf page we wish to delete, and "stack" is a search stack
- * leading to it (approximately). Note that we will update the stack
- * entry(s) to reflect current downlink positions --- this is essentially the
- * same as the corresponding step of splitting, and is not expected to affect
- * caller. The caller should initialize *target and *rightsib to the leaf
- * page and its right sibling.
+ * leading to it (it actually leads to the leftmost leaf page with a high key
+ * matching that of the page to be deleted in !heapkeyspace indexes). Note
+ * that we will update the stack entry(s) to reflect current downlink
+ * positions --- this is essentially the same as the corresponding step of
+ * splitting, and is not expected to affect caller. The caller should
+ * initialize *target and *rightsib to the leaf page and its right sibling.
*
* Note: it's OK to release page locks on any internal pages between the leaf
* and *topparent, because a safe deletion can't become unsafe due to
@@ -1149,8 +1219,10 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
BlockNumber leftsib;
/*
- * Locate the downlink of "child" in the parent (updating the stack entry
- * if needed)
+ * Locate the downlink of "child" in the parent, updating the stack entry
+ * if needed. This is how !heapkeyspace indexes deal with having
+ * non-unique high keys in leaf level pages. Even heapkeyspace indexes
+ * can have a stale stack due to insertions into the parent.
*/
stack->bts_btentry = child;
pbuf = _bt_getstackbuf(rel, stack);
@@ -1362,9 +1434,10 @@ _bt_pagedel(Relation rel, Buffer buf)
{
/*
* We need an approximate pointer to the page's parent page. We
- * use the standard search mechanism to search for the page's high
- * key; this will give us a link to either the current parent or
- * someplace to its left (if there are multiple equal high keys).
+ * use a variant of the standard search mechanism to search for
+ * the page's high key; this will give us a link to either the
+ * current parent or someplace to its left (if there are multiple
+ * equal high keys, which is possible with !heapkeyspace indexes).
*
* Also check if this is the right-half of an incomplete split
* (see comment above).
@@ -1422,7 +1495,8 @@ _bt_pagedel(Relation rel, Buffer buf)
/* we need an insertion scan key for the search, so build one */
itup_key = _bt_mkscankey(rel, targetkey);
- /* get stack to leaf page by searching index */
+ /* find the leftmost leaf page with matching pivot/high key */
+ itup_key->pivotsearch = true;
stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL);
/* don't need a lock or second pin on the page */
_bt_relbuf(rel, lbuf);
@@ -1969,7 +2043,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
if (BufferIsValid(metabuf))
{
/* upgrade metapage if needed */
- if (metad->btm_version < BTREE_VERSION)
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_fastroot = rightsib;
metad->btm_fastlevel = targetlevel;
@@ -2017,6 +2091,8 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
{
XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ xlmeta.version = metad->btm_version;
xlmeta.root = metad->btm_root;
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;