diff options
author | Marc G. Fournier <scrappy@hub.org> | 1996-07-09 06:22:35 +0000 |
---|---|---|
committer | Marc G. Fournier <scrappy@hub.org> | 1996-07-09 06:22:35 +0000 |
commit | d31084e9d1118b25fd16580d9d8c2924b5740dff (patch) | |
tree | 3179e66307d54df9c7b966543550e601eb55e668 /src/backend/access/nbtree/nbtpage.c | |
download | postgresql-PG95-1_01.tar.gz postgresql-PG95-1_01.zip |
Postgres95 1.01 Distribution - Virgin SourcesPG95-1_01
Diffstat (limited to 'src/backend/access/nbtree/nbtpage.c')
-rw-r--r-- | src/backend/access/nbtree/nbtpage.c | 523 |
1 files changed, 523 insertions, 0 deletions
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c new file mode 100644 index 00000000000..ce411a80d11 --- /dev/null +++ b/src/backend/access/nbtree/nbtpage.c @@ -0,0 +1,523 @@ +/*------------------------------------------------------------------------- + * + * btpage.c-- + * BTree-specific page management code for the Postgres btree access + * method. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * Postgres btree pages look like ordinary relation pages. The opaque + * data at high addresses includes pointers to left and right siblings + * and flag data describing page state. The first page in a btree, page + * zero, is special -- it stores meta-information describing the tree. + * Pages one and higher store the actual tree data. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/genam.h" +#include "access/nbtree.h" + +#define BTREE_METAPAGE 0 +#define BTREE_MAGIC 0x053162 +#define BTREE_VERSION 0 + +typedef struct BTMetaPageData { + uint32 btm_magic; + uint32 btm_version; + BlockNumber btm_root; +} BTMetaPageData; + +#define BTPageGetMeta(p) \ + ((BTMetaPageData *) &((PageHeader) p)->pd_linp[0]) + +extern bool BuildingBtree; + +/* + * We use high-concurrency locking on btrees. There are two cases in + * which we don't do locking. One is when we're building the btree. + * Since the creating transaction has not committed, no one can see + * the index, and there's no reason to share locks. The second case + * is when we're just starting up the database system. We use some + * special-purpose initialization code in the relation cache manager + * (see utils/cache/relcache.c) to allow us to do indexed scans on + * the system catalogs before we'd normally be able to. This happens + * before the lock table is fully initialized, so we can't use it. + * Strictly speaking, this violates 2pl, but we don't do 2pl on the + * system catalogs anyway, so I declare this to be okay. + */ + +#define USELOCKING (!BuildingBtree && !IsInitProcessingMode()) + +/* + * _bt_metapinit() -- Initialize the metadata page of a btree. + */ +void +_bt_metapinit(Relation rel) +{ + Buffer buf; + Page pg; + int nblocks; + BTMetaPageData metad; + BTPageOpaque op; + + /* can't be sharing this with anyone, now... */ + if (USELOCKING) + RelationSetLockForWrite(rel); + + if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) { + elog(WARN, "Cannot initialize non-empty btree %s", + RelationGetRelationName(rel)); + } + + buf = ReadBuffer(rel, P_NEW); + pg = BufferGetPage(buf); + _bt_pageinit(pg, BufferGetPageSize(buf)); + + metad.btm_magic = BTREE_MAGIC; + metad.btm_version = BTREE_VERSION; + metad.btm_root = P_NONE; + memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad)); + + op = (BTPageOpaque) PageGetSpecialPointer(pg); + op->btpo_flags = BTP_META; + + WriteBuffer(buf); + + /* all done */ + if (USELOCKING) + RelationUnsetLockForWrite(rel); +} + +/* + * _bt_checkmeta() -- Verify that the metadata stored in a btree are + * reasonable. + */ +void +_bt_checkmeta(Relation rel) +{ + Buffer metabuf; + Page metap; + BTMetaPageData *metad; + BTPageOpaque op; + int nblocks; + + /* if the relation is empty, this is init time; don't complain */ + if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0) + return; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metap = BufferGetPage(metabuf); + op = (BTPageOpaque) PageGetSpecialPointer(metap); + if (!(op->btpo_flags & BTP_META)) { + elog(WARN, "Invalid metapage for index %s", + RelationGetRelationName(rel)); + } + metad = BTPageGetMeta(metap); + + if (metad->btm_magic != BTREE_MAGIC) { + elog(WARN, "Index %s is not a btree", + RelationGetRelationName(rel)); + } + + if (metad->btm_version != BTREE_VERSION) { + elog(WARN, "Version mismatch on %s: version %d file, version %d code", + RelationGetRelationName(rel), + metad->btm_version, BTREE_VERSION); + } + + _bt_relbuf(rel, metabuf, BT_READ); +} + +/* + * _bt_getroot() -- Get the root page of the btree. + * + * Since the root page can move around the btree file, we have to read + * its location from the metadata page, and then read the root page + * itself. If no root page exists yet, we have to create one. The + * standard class of race conditions exists here; I think I covered + * them all in the Hopi Indian rain dance of lock requests below. + * + * We pass in the access type (BT_READ or BT_WRITE), and return the + * root page's buffer with the appropriate lock type set. Reference + * count on the root page gets bumped by ReadBuffer. The metadata + * page is unlocked and unreferenced by this process when this routine + * returns. + */ +Buffer +_bt_getroot(Relation rel, int access) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + Buffer rootbuf; + Page rootpg; + BTPageOpaque rootopaque; + BlockNumber rootblkno; + BTMetaPageData *metad; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metapg); + + /* if no root page initialized yet, do it */ + if (metad->btm_root == P_NONE) { + + /* turn our read lock in for a write lock */ + _bt_relbuf(rel, metabuf, BT_READ); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metapg); + + /* + * Race condition: if someone else initialized the metadata between + * the time we released the read lock and acquired the write lock, + * above, we want to avoid doing it again. + */ + + if (metad->btm_root == P_NONE) { + + /* + * Get, initialize, write, and leave a lock of the appropriate + * type on the new root page. Since this is the first page in + * the tree, it's a leaf. + */ + + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootblkno = BufferGetBlockNumber(rootbuf); + rootpg = BufferGetPage(rootbuf); + metad->btm_root = rootblkno; + _bt_pageinit(rootpg, BufferGetPageSize(rootbuf)); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg); + rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT); + _bt_wrtnorelbuf(rel, rootbuf); + + /* swap write lock for read lock, if appropriate */ + if (access != BT_WRITE) { + _bt_setpagelock(rel, rootblkno, BT_READ); + _bt_unsetpagelock(rel, rootblkno, BT_WRITE); + } + + /* okay, metadata is correct */ + _bt_wrtbuf(rel, metabuf); + } else { + + /* + * Metadata initialized by someone else. In order to guarantee + * no deadlocks, we have to release the metadata page and start + * all over again. + */ + + _bt_relbuf(rel, metabuf, BT_WRITE); + return (_bt_getroot(rel, access)); + } + } else { + rootbuf = _bt_getbuf(rel, metad->btm_root, access); + + /* done with the meta page */ + _bt_relbuf(rel, metabuf, BT_READ); + } + + /* + * Race condition: If the root page split between the time we looked + * at the metadata page and got the root buffer, then we got the wrong + * buffer. + */ + + rootpg = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg); + if (!(rootopaque->btpo_flags & BTP_ROOT)) { + + /* it happened, try again */ + _bt_relbuf(rel, rootbuf, access); + return (_bt_getroot(rel, access)); + } + + /* + * By here, we have a correct lock on the root block, its reference + * count is correct, and we have no lock set on the metadata page. + * Return the root block. + */ + + return (rootbuf); +} + +/* + * _bt_getbuf() -- Get a buffer by block number for read or write. + * + * When this routine returns, the appropriate lock is set on the + * requested buffer its reference count is correct. + */ +Buffer +_bt_getbuf(Relation rel, BlockNumber blkno, int access) +{ + Buffer buf; + Page page; + + /* + * If we want a new block, we can't set a lock of the appropriate type + * until we've instantiated the buffer. + */ + + if (blkno != P_NEW) { + if (access == BT_WRITE) + _bt_setpagelock(rel, blkno, BT_WRITE); + else + _bt_setpagelock(rel, blkno, BT_READ); + + buf = ReadBuffer(rel, blkno); + } else { + buf = ReadBuffer(rel, blkno); + blkno = BufferGetBlockNumber(buf); + page = BufferGetPage(buf); + _bt_pageinit(page, BufferGetPageSize(buf)); + + if (access == BT_WRITE) + _bt_setpagelock(rel, blkno, BT_WRITE); + else + _bt_setpagelock(rel, blkno, BT_READ); + } + + /* ref count and lock type are correct */ + return (buf); +} + +/* + * _bt_relbuf() -- release a locked buffer. + */ +void +_bt_relbuf(Relation rel, Buffer buf, int access) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + + /* access had better be one of read or write */ + if (access == BT_WRITE) + _bt_unsetpagelock(rel, blkno, BT_WRITE); + else + _bt_unsetpagelock(rel, blkno, BT_READ); + + ReleaseBuffer(buf); +} + +/* + * _bt_wrtbuf() -- write a btree page to disk. + * + * This routine releases the lock held on the buffer and our reference + * to it. It is an error to call _bt_wrtbuf() without a write lock + * or a reference to the buffer. + */ +void +_bt_wrtbuf(Relation rel, Buffer buf) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + WriteBuffer(buf); + _bt_unsetpagelock(rel, blkno, BT_WRITE); +} + +/* + * _bt_wrtnorelbuf() -- write a btree page to disk, but do not release + * our reference or lock. + * + * It is an error to call _bt_wrtnorelbuf() without a write lock + * or a reference to the buffer. + */ +void +_bt_wrtnorelbuf(Relation rel, Buffer buf) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + WriteNoReleaseBuffer(buf); +} + +/* + * _bt_pageinit() -- Initialize a new page. + */ +void +_bt_pageinit(Page page, Size size) +{ + /* + * Cargo-cult programming -- don't really need this to be zero, but + * creating new pages is an infrequent occurrence and it makes me feel + * good when I know they're empty. + */ + + memset(page, 0, size); + + PageInit(page, size, sizeof(BTPageOpaqueData)); +} + +/* + * _bt_metaproot() -- Change the root page of the btree. + * + * Lehman and Yao require that the root page move around in order to + * guarantee deadlock-free short-term, fine-granularity locking. When + * we split the root page, we record the new parent in the metadata page + * for the relation. This routine does the work. + * + * No direct preconditions, but if you don't have the a write lock on + * at least the old root page when you call this, you're making a big + * mistake. On exit, metapage data is correct and we no longer have + * a reference to or lock on the metapage. + */ +void +_bt_metaproot(Relation rel, BlockNumber rootbknum) +{ + Buffer metabuf; + Page metap; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metap = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metap); + metad->btm_root = rootbknum; + _bt_wrtbuf(rel, metabuf); +} + +/* + * _bt_getstackbuf() -- Walk back up the tree one step, and find the item + * we last looked at in the parent. + * + * This is possible because we save a bit image of the last item + * we looked at in the parent, and the update algorithm guarantees + * that if items above us in the tree move, they only move right. + */ +Buffer +_bt_getstackbuf(Relation rel, BTStack stack, int access) +{ + Buffer buf; + BlockNumber blkno; + OffsetNumber start, offnum, maxoff; + OffsetNumber i; + Page page; + ItemId itemid; + BTItem item; + BTPageOpaque opaque; + + blkno = stack->bts_blkno; + buf = _bt_getbuf(rel, blkno, access); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + if (maxoff >= stack->bts_offset) { + itemid = PageGetItemId(page, stack->bts_offset); + item = (BTItem) PageGetItem(page, itemid); + + /* if the item is where we left it, we're done */ + if (item->bti_oid == stack->bts_btitem->bti_oid) + return (buf); + + /* if the item has just moved right on this page, we're done */ + for (i = OffsetNumberNext(stack->bts_offset); + i <= maxoff; + i = OffsetNumberNext(i)) { + itemid = PageGetItemId(page, i); + item = (BTItem) PageGetItem(page, itemid); + + /* if the item is where we left it, we're done */ + if (item->bti_oid == stack->bts_btitem->bti_oid) + return (buf); + } + } + + /* by here, the item we're looking for moved right at least one page */ + for (;;) { + blkno = opaque->btpo_next; + if (P_RIGHTMOST(opaque)) + elog(FATAL, "my bits moved right off the end of the world!"); + + _bt_relbuf(rel, buf, access); + buf = _bt_getbuf(rel, blkno, access); + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* if we have a right sibling, step over the high key */ + start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + /* see if it's on this page */ + for (offnum = start; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) { + itemid = PageGetItemId(page, offnum); + item = (BTItem) PageGetItem(page, itemid); + if (item->bti_oid == stack->bts_btitem->bti_oid) + return (buf); + } + } +} + +void +_bt_setpagelock(Relation rel, BlockNumber blkno, int access) +{ + ItemPointerData iptr; + + if (USELOCKING) { + ItemPointerSet(&iptr, blkno, P_HIKEY); + + if (access == BT_WRITE) + RelationSetSingleWLockPage(rel, &iptr); + else + RelationSetSingleRLockPage(rel, &iptr); + } +} + +void +_bt_unsetpagelock(Relation rel, BlockNumber blkno, int access) +{ + ItemPointerData iptr; + + if (USELOCKING) { + ItemPointerSet(&iptr, blkno, P_HIKEY); + + if (access == BT_WRITE) + RelationUnsetSingleWLockPage(rel, &iptr); + else + RelationUnsetSingleRLockPage(rel, &iptr); + } +} + +void +_bt_pagedel(Relation rel, ItemPointer tid) +{ + Buffer buf; + Page page; + BlockNumber blkno; + OffsetNumber offno; + + blkno = ItemPointerGetBlockNumber(tid); + offno = ItemPointerGetOffsetNumber(tid); + + buf = _bt_getbuf(rel, blkno, BT_WRITE); + page = BufferGetPage(buf); + + PageIndexTupleDelete(page, offno); + + /* write the buffer and release the lock */ + _bt_wrtbuf(rel, buf); +} |