Diffstat (limited to 'src/backend/access/nbtree')
-rw-r--r-- | src/backend/access/nbtree/nbtinsert.c | 268
-rw-r--r-- | src/backend/access/nbtree/nbtpage.c | 194
-rw-r--r-- | src/backend/access/nbtree/nbtree.c | 180
-rw-r--r-- | src/backend/access/nbtree/nbtsearch.c | 279
-rw-r--r-- | src/backend/access/nbtree/nbtsort.c | 138
-rw-r--r-- | src/backend/access/nbtree/nbtutils.c | 86
-rw-r--r-- | src/backend/access/nbtree/nbtxlog.c | 20
7 files changed, 572 insertions(+), 593 deletions(-)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index c73ba358ec1..33c7612aac5 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.126 2005/10/12 17:18:03 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.127 2005/10/15 02:49:09 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -93,30 +93,29 @@ top: /* * If the page was split between the time that we surrendered our read - * lock and acquired our write lock, then this page may no longer be - * the right place for the key we want to insert. In this case, we - * need to move right in the tree. See Lehman and Yao for an - * excruciatingly precise description. + * lock and acquired our write lock, then this page may no longer be the + * right place for the key we want to insert. In this case, we need to + * move right in the tree. See Lehman and Yao for an excruciatingly + * precise description. */ buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE); /* - * If we're not allowing duplicates, make sure the key isn't already - * in the index. + * If we're not allowing duplicates, make sure the key isn't already in + * the index. * - * NOTE: obviously, _bt_check_unique can only detect keys that are - * already in the index; so it cannot defend against concurrent - * insertions of the same key. We protect against that by means of - * holding a write lock on the target page. Any other would-be - * inserter of the same key must acquire a write lock on the same - * target page, so only one would-be inserter can be making the check - * at one time. Furthermore, once we are past the check we hold write - * locks continuously until we have performed our insertion, so no - * later inserter can fail to see our insertion. (This requires some - * care in _bt_insertonpg.) + * NOTE: obviously, _bt_check_unique can only detect keys that are already in + * the index; so it cannot defend against concurrent insertions of the + * same key. We protect against that by means of holding a write lock on + * the target page. Any other would-be inserter of the same key must + * acquire a write lock on the same target page, so only one would-be + * inserter can be making the check at one time. Furthermore, once we are + * past the check we hold write locks continuously until we have performed + * our insertion, so no later inserter can fail to see our insertion. + * (This requires some care in _bt_insertonpg.) * - * If we must wait for another xact, we release the lock while waiting, - * and then must start over completely. + * If we must wait for another xact, we release the lock while waiting, and + * then must start over completely. */ if (index_is_unique) { @@ -167,8 +166,8 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, maxoff = PageGetMaxOffsetNumber(page); /* - * Find first item >= proposed new item. Note we could also get a - * pointer to end-of-page here. + * Find first item >= proposed new item. Note we could also get a pointer + * to end-of-page here. */ offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); @@ -194,24 +193,24 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, /* * We can skip items that are marked killed. 
* - * Formerly, we applied _bt_isequal() before checking the kill - * flag, so as to fall out of the item loop as soon as - * possible. However, in the presence of heavy update activity - * an index may contain many killed items with the same key; - * running _bt_isequal() on each killed item gets expensive. - * Furthermore it is likely that the non-killed version of - * each key appears first, so that we didn't actually get to - * exit any sooner anyway. So now we just advance over killed - * items as quickly as we can. We only apply _bt_isequal() - * when we get to a non-killed item or the end of the page. + * Formerly, we applied _bt_isequal() before checking the kill flag, + * so as to fall out of the item loop as soon as possible. + * However, in the presence of heavy update activity an index may + * contain many killed items with the same key; running + * _bt_isequal() on each killed item gets expensive. Furthermore + * it is likely that the non-killed version of each key appears + * first, so that we didn't actually get to exit any sooner + * anyway. So now we just advance over killed items as quickly as + * we can. We only apply _bt_isequal() when we get to a non-killed + * item or the end of the page. */ if (!ItemIdDeleted(curitemid)) { /* - * _bt_compare returns 0 for (1,NULL) and (1,NULL) - - * this's how we handling NULLs - and so we must not use - * _bt_compare in real comparison, but only for - * ordering/finding items on pages. - vadim 03/24/97 + * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's + * how we handling NULLs - and so we must not use _bt_compare + * in real comparison, but only for ordering/finding items on + * pages. - vadim 03/24/97 */ if (!_bt_isequal(itupdesc, page, offset, natts, itup_scankey)) break; /* we're past all the equal tuples */ @@ -246,15 +245,15 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, */ ereport(ERROR, (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("duplicate key violates unique constraint \"%s\"", - RelationGetRelationName(rel)))); + errmsg("duplicate key violates unique constraint \"%s\"", + RelationGetRelationName(rel)))); } else if (htup.t_data != NULL) { /* - * Hmm, if we can't see the tuple, maybe it can be - * marked killed. This logic should match - * index_getnext and btgettuple. + * Hmm, if we can't see the tuple, maybe it can be marked + * killed. This logic should match index_getnext and + * btgettuple. */ LockBuffer(hbuffer, BUFFER_LOCK_SHARE); if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin, @@ -377,15 +376,15 @@ _bt_insertonpg(Relation rel, itemsz = IndexTupleDSize(btitem->bti_itup) + (sizeof(BTItemData) - sizeof(IndexTupleData)); - itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but - * we need to be consistent */ + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we + * need to be consistent */ /* - * Check whether the item can fit on a btree page at all. (Eventually, - * we ought to try to apply TOAST methods if not.) We actually need to - * be able to fit three items on every page, so restrict any one item - * to 1/3 the per-page available space. Note that at this point, - * itemsz doesn't include the ItemId. + * Check whether the item can fit on a btree page at all. (Eventually, we + * ought to try to apply TOAST methods if not.) We actually need to be + * able to fit three items on every page, so restrict any one item to 1/3 + * the per-page available space. Note that at this point, itemsz doesn't + * include the ItemId. 
*/ if (itemsz > BTMaxItemSize(page)) ereport(ERROR, @@ -393,9 +392,9 @@ _bt_insertonpg(Relation rel, errmsg("index row size %lu exceeds btree maximum, %lu", (unsigned long) itemsz, (unsigned long) BTMaxItemSize(page)), - errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" - "Consider a function index of an MD5 hash of the value, " - "or use full text indexing."))); + errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" + "Consider a function index of an MD5 hash of the value, " + "or use full text indexing."))); /* * Determine exactly where new item will go. @@ -432,11 +431,11 @@ _bt_insertonpg(Relation rel, /* * step right to next non-dead page * - * must write-lock that page before releasing write lock on - * current page; else someone else's _bt_check_unique scan - * could fail to see our insertion. write locks on - * intermediate dead pages won't do because we don't know when - * they will get de-linked from the tree. + * must write-lock that page before releasing write lock on current + * page; else someone else's _bt_check_unique scan could fail to + * see our insertion. write locks on intermediate dead pages + * won't do because we don't know when they will get de-linked + * from the tree. */ Buffer rbuf = InvalidBuffer; @@ -459,9 +458,9 @@ _bt_insertonpg(Relation rel, } /* - * Now we are on the right page, so find the insert position. If - * we moved right at all, we know we should insert at the start of - * the page, else must find the position by searching. + * Now we are on the right page, so find the insert position. If we + * moved right at all, we know we should insert at the start of the + * page, else must find the position by searching. */ if (movedright) newitemoff = P_FIRSTDATAKEY(lpageop); @@ -472,9 +471,9 @@ _bt_insertonpg(Relation rel, /* * Do we need to split the page to fit the item on it? * - * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, - * so this comparison is correct even though we appear to be - * accounting only for the item and not for its line pointer. + * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, so + * this comparison is correct even though we appear to be accounting only + * for the item and not for its line pointer. */ if (PageGetFreeSpace(page) < itemsz) { @@ -522,12 +521,11 @@ _bt_insertonpg(Relation rel, itup_blkno = BufferGetBlockNumber(buf); /* - * If we are doing this insert because we split a page that was - * the only one on its tree level, but was not the root, it may - * have been the "fast root". We need to ensure that the fast - * root link points at or above the current page. We can safely - * acquire a lock on the metapage here --- see comments for - * _bt_newroot(). + * If we are doing this insert because we split a page that was the + * only one on its tree level, but was not the root, it may have been + * the "fast root". We need to ensure that the fast root link points + * at or above the current page. We can safely acquire a lock on the + * metapage here --- see comments for _bt_newroot(). */ if (split_only_page) { @@ -692,11 +690,11 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level; /* - * If the page we're splitting is not the rightmost page at its level - * in the tree, then the first entry on the page is the high key for - * the page. We need to copy that to the right half. 
Otherwise - * (meaning the rightmost page case), all the items on the right half - * will be user data. + * If the page we're splitting is not the rightmost page at its level in + * the tree, then the first entry on the page is the high key for the + * page. We need to copy that to the right half. Otherwise (meaning the + * rightmost page case), all the items on the right half will be user + * data. */ rightoff = P_HIKEY; @@ -712,9 +710,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, } /* - * The "high key" for the new left page will be the first key that's - * going to go into the new right page. This might be either the - * existing data item at position firstright, or the incoming tuple. + * The "high key" for the new left page will be the first key that's going + * to go into the new right page. This might be either the existing data + * item at position firstright, or the incoming tuple. */ leftoff = P_HIKEY; if (!newitemonleft && newitemoff == firstright) @@ -806,8 +804,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, /* * We have to grab the right sibling (if any) and fix the prev pointer * there. We are guaranteed that this is deadlock-free since no other - * writer will be holding a lock on that page and trying to move left, - * and all readers release locks on a page before trying to fetch its + * writer will be holding a lock on that page and trying to move left, and + * all readers release locks on a page before trying to fetch its * neighbors. */ @@ -821,8 +819,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, } /* - * Right sibling is locked, new siblings are prepared, but original - * page is not updated yet. Log changes before continuing. + * Right sibling is locked, new siblings are prepared, but original page + * is not updated yet. Log changes before continuing. * * NO EREPORT(ERROR) till right sibling is updated. */ @@ -850,10 +848,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, xlrec.level = lopaque->btpo.level; /* - * Direct access to page is not good but faster - we should - * implement some new func in page API. Note we only store the - * tuples themselves, knowing that the item pointers are in the - * same order and can be reconstructed by scanning the tuples. + * Direct access to page is not good but faster - we should implement + * some new func in page API. Note we only store the tuples + * themselves, knowing that the item pointers are in the same order + * and can be reconstructed by scanning the tuples. */ xlrec.leftlen = ((PageHeader) leftpage)->pd_special - ((PageHeader) leftpage)->pd_upper; @@ -903,13 +901,13 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, } /* - * By here, the original data page has been split into two new halves, - * and these are correct. The algorithm requires that the left page - * never move during a split, so we copy the new left page back on top - * of the original. Note that this is not a waste of time, since we - * also require (in the page management code) that the center of a - * page always be clean, and the most efficient way to guarantee this - * is just to compact the data by reinserting it into a new left page. + * By here, the original data page has been split into two new halves, and + * these are correct. The algorithm requires that the left page never + * move during a split, so we copy the new left page back on top of the + * original. 
Note that this is not a waste of time, since we also require + * (in the page management code) that the center of a page always be + * clean, and the most efficient way to guarantee this is just to compact + * the data by reinserting it into a new left page. */ PageRestoreTempPage(leftpage, origpage); @@ -984,13 +982,13 @@ _bt_findsplitloc(Relation rel, MAXALIGN(sizeof(BTPageOpaqueData)); /* - * Finding the best possible split would require checking all the - * possible split points, because of the high-key and left-key special - * cases. That's probably more work than it's worth; instead, stop as - * soon as we find a "good-enough" split, where good-enough is defined - * as an imbalance in free space of no more than pagesize/16 - * (arbitrary...) This should let us stop near the middle on most - * pages, instead of plowing to the end. + * Finding the best possible split would require checking all the possible + * split points, because of the high-key and left-key special cases. + * That's probably more work than it's worth; instead, stop as soon as we + * find a "good-enough" split, where good-enough is defined as an + * imbalance in free space of no more than pagesize/16 (arbitrary...) This + * should let us stop near the middle on most pages, instead of plowing to + * the end. */ goodenough = leftspace / 16; @@ -1006,8 +1004,8 @@ _bt_findsplitloc(Relation rel, dataitemtotal = rightspace - (int) PageGetFreeSpace(page); /* - * Scan through the data items and calculate space usage for a split - * at each possible position. + * Scan through the data items and calculate space usage for a split at + * each possible position. */ dataitemstoleft = 0; maxoff = PageGetMaxOffsetNumber(page); @@ -1024,9 +1022,9 @@ _bt_findsplitloc(Relation rel, itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); /* - * We have to allow for the current item becoming the high key of - * the left page; therefore it counts against left space as well - * as right space. + * We have to allow for the current item becoming the high key of the + * left page; therefore it counts against left space as well as right + * space. */ leftfree = leftspace - dataitemstoleft - (int) itemsz; rightfree = rightspace - (dataitemtotal - dataitemstoleft); @@ -1058,8 +1056,8 @@ _bt_findsplitloc(Relation rel, } /* - * I believe it is not possible to fail to find a feasible split, but - * just in case ... + * I believe it is not possible to fail to find a feasible split, but just + * in case ... */ if (!state.have_split) elog(ERROR, "could not find a feasible split point for \"%s\"", @@ -1105,8 +1103,7 @@ _bt_checksplitloc(FindSplitData *state, OffsetNumber firstright, { /* * On a rightmost page, try to equalize right free space with - * twice the left free space. See comments for - * _bt_findsplitloc. + * twice the left free space. See comments for _bt_findsplitloc. */ delta = (2 * leftfree) - rightfree; } @@ -1153,19 +1150,18 @@ _bt_insert_parent(Relation rel, bool is_only) { /* - * Here we have to do something Lehman and Yao don't talk about: deal - * with a root split and construction of a new root. If our stack is - * empty then we have just split a node on what had been the root - * level when we descended the tree. If it was still the root then we - * perform a new-root construction. If it *wasn't* the root anymore, - * search to find the next higher level that someone constructed - * meanwhile, and find the right place to insert as for the normal - * case. 
+ * Here we have to do something Lehman and Yao don't talk about: deal with + * a root split and construction of a new root. If our stack is empty + * then we have just split a node on what had been the root level when we + * descended the tree. If it was still the root then we perform a + * new-root construction. If it *wasn't* the root anymore, search to find + * the next higher level that someone constructed meanwhile, and find the + * right place to insert as for the normal case. * - * If we have to search for the parent level, we do so by re-descending - * from the root. This is not super-efficient, but it's rare enough - * not to matter. (This path is also taken when called from WAL - * recovery --- we have no stack in that case.) + * If we have to search for the parent level, we do so by re-descending from + * the root. This is not super-efficient, but it's rare enough not to + * matter. (This path is also taken when called from WAL recovery --- we + * have no stack in that case.) */ if (is_root) { @@ -1219,9 +1215,9 @@ _bt_insert_parent(Relation rel, /* * Find the parent buffer and get the parent page. * - * Oops - if we were moved right then we need to change stack item! - * We want to find parent pointing to where we are, right ? - - * vadim 05/27/97 + * Oops - if we were moved right then we need to change stack item! We + * want to find parent pointing to where we are, right ? - vadim + * 05/27/97 */ ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid), bknum, P_HIKEY); @@ -1291,9 +1287,9 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access) maxoff = PageGetMaxOffsetNumber(page); /* - * start = InvalidOffsetNumber means "search the whole page". - * We need this test anyway due to possibility that page has a - * high key now when it didn't before. + * start = InvalidOffsetNumber means "search the whole page". We + * need this test anyway due to possibility that page has a high + * key now when it didn't before. */ if (start < minoff) start = minoff; @@ -1307,8 +1303,8 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access) /* * These loops will check every item on the page --- but in an - * order that's attuned to the probability of where it - * actually is. Scan to the right first, then to the left. + * order that's attuned to the probability of where it actually + * is. Scan to the right first, then to the left. */ for (offnum = start; offnum <= maxoff; @@ -1424,9 +1420,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) metad->btm_fastlevel = rootopaque->btpo.level; /* - * Create downlink item for left page (old root). Since this will be - * the first item in a non-leaf page, it implicitly has minus-infinity - * key value, so we need not store any actual key in it. + * Create downlink item for left page (old root). Since this will be the + * first item in a non-leaf page, it implicitly has minus-infinity key + * value, so we need not store any actual key in it. */ itemsz = sizeof(BTItemData); new_item = (BTItem) palloc(itemsz); @@ -1434,17 +1430,17 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_HIKEY); /* - * Insert the left page pointer into the new root page. The root page - * is the rightmost page on its level so there is no "high key" in it; - * the two items will go into positions P_HIKEY and P_FIRSTKEY. + * Insert the left page pointer into the new root page. 
The root page is + * the rightmost page on its level so there is no "high key" in it; the + * two items will go into positions P_HIKEY and P_FIRSTKEY. */ if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber) elog(PANIC, "failed to add leftkey to new root page"); pfree(new_item); /* - * Create downlink item for right page. The key for it is obtained - * from the "high key" position in the left page. + * Create downlink item for right page. The key for it is obtained from + * the "high key" position in the left page. */ itemid = PageGetItemId(lpage, P_HIKEY); itemsz = ItemIdGetLength(itemid); @@ -1476,8 +1472,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) rdata[0].next = &(rdata[1]); /* - * Direct access to page is not good but faster - we should - * implement some new func in page API. + * Direct access to page is not good but faster - we should implement + * some new func in page API. */ rdata[1].data = (char *) rootpage + ((PageHeader) rootpage)->pd_upper; rdata[1].len = ((PageHeader) rootpage)->pd_special - diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 52d60abaec0..927860030c8 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.87 2005/08/12 14:34:14 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.88 2005/10/15 02:49:09 momjian Exp $ * * NOTES * Postgres btree pages look like ordinary relation pages. The opaque @@ -115,8 +115,8 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) metaopaque->btpo_flags = BTP_META; /* - * Set pd_lower just past the end of the metadata. This is not - * essential but it makes the page look compressible to xlog.c. + * Set pd_lower just past the end of the metadata. This is not essential + * but it makes the page look compressible to xlog.c. */ ((PageHeader) page)->pd_lower = ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; @@ -198,26 +198,26 @@ _bt_getroot(Relation rel, int access) LockBuffer(metabuf, BT_WRITE); /* - * Race condition: if someone else initialized the metadata - * between the time we released the read lock and acquired the - * write lock, we must avoid doing it again. + * Race condition: if someone else initialized the metadata between + * the time we released the read lock and acquired the write lock, we + * must avoid doing it again. */ if (metad->btm_root != P_NONE) { /* - * Metadata initialized by someone else. In order to - * guarantee no deadlocks, we have to release the metadata - * page and start all over again. (Is that really true? But - * it's hardly worth trying to optimize this case.) + * Metadata initialized by someone else. In order to guarantee no + * deadlocks, we have to release the metadata page and start all + * over again. (Is that really true? But it's hardly worth trying + * to optimize this case.) */ _bt_relbuf(rel, metabuf); return _bt_getroot(rel, access); } /* - * Get, initialize, write, and leave a lock of the appropriate - * type on the new root page. Since this is the first page in the - * tree, it's a leaf as well as the root. + * Get, initialize, write, and leave a lock of the appropriate type on + * the new root page. Since this is the first page in the tree, it's + * a leaf as well as the root. 
*/ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); rootblkno = BufferGetBlockNumber(rootbuf); @@ -266,9 +266,9 @@ _bt_getroot(Relation rel, int access) _bt_wrtnorelbuf(rel, rootbuf); /* - * swap root write lock for read lock. There is no danger of - * anyone else accessing the new root page while it's unlocked, - * since no one else knows where it is yet. + * swap root write lock for read lock. There is no danger of anyone + * else accessing the new root page while it's unlocked, since no one + * else knows where it is yet. */ LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK); LockBuffer(rootbuf, BT_READ); @@ -312,8 +312,8 @@ _bt_getroot(Relation rel, int access) } /* - * By here, we have a pin and read lock on the root page, and no lock - * set on the metadata page. Return the root page's buffer. + * By here, we have a pin and read lock on the root page, and no lock set + * on the metadata page. Return the root page's buffer. */ return rootbuf; } @@ -435,27 +435,26 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) /* * First see if the FSM knows of any free pages. * - * We can't trust the FSM's report unreservedly; we have to check - * that the page is still free. (For example, an already-free - * page could have been re-used between the time the last VACUUM - * scanned it and the time the VACUUM made its FSM updates.) + * We can't trust the FSM's report unreservedly; we have to check that + * the page is still free. (For example, an already-free page could + * have been re-used between the time the last VACUUM scanned it and + * the time the VACUUM made its FSM updates.) * - * In fact, it's worse than that: we can't even assume that it's safe - * to take a lock on the reported page. If somebody else has a - * lock on it, or even worse our own caller does, we could - * deadlock. (The own-caller scenario is actually not improbable. - * Consider an index on a serial or timestamp column. Nearly all - * splits will be at the rightmost page, so it's entirely likely - * that _bt_split will call us while holding a lock on the page - * most recently acquired from FSM. A VACUUM running concurrently - * with the previous split could well have placed that page back - * in FSM.) + * In fact, it's worse than that: we can't even assume that it's safe to + * take a lock on the reported page. If somebody else has a lock on + * it, or even worse our own caller does, we could deadlock. (The + * own-caller scenario is actually not improbable. Consider an index + * on a serial or timestamp column. Nearly all splits will be at the + * rightmost page, so it's entirely likely that _bt_split will call us + * while holding a lock on the page most recently acquired from FSM. + * A VACUUM running concurrently with the previous split could well + * have placed that page back in FSM.) * - * To get around that, we ask for only a conditional lock on the - * reported page. If we fail, then someone else is using the - * page, and we may reasonably assume it's not free. (If we - * happen to be wrong, the worst consequence is the page will be - * lost to use till the next VACUUM, which is no big problem.) + * To get around that, we ask for only a conditional lock on the reported + * page. If we fail, then someone else is using the page, and we may + * reasonably assume it's not free. (If we happen to be wrong, the + * worst consequence is the page will be lost to use till the next + * VACUUM, which is no big problem.) 
*/ for (;;) { @@ -486,10 +485,10 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) /* * Extend the relation by one page. * - * We have to use a lock to ensure no one else is extending the rel - * at the same time, else we will both try to initialize the same - * new page. We can skip locking for new or temp relations, - * however, since no one else could be accessing them. + * We have to use a lock to ensure no one else is extending the rel at + * the same time, else we will both try to initialize the same new + * page. We can skip locking for new or temp relations, however, + * since no one else could be accessing them. */ needLock = !RELATION_IS_LOCAL(rel); @@ -504,8 +503,8 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) /* * Release the file-extension lock; it's now OK for someone else to * extend the relation some more. Note that we cannot release this - * lock before we have buffer lock on the new page, or we risk a - * race condition against btvacuumcleanup --- see comments therein. + * lock before we have buffer lock on the new page, or we risk a race + * condition against btvacuumcleanup --- see comments therein. */ if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); @@ -614,10 +613,10 @@ _bt_page_recyclable(Page page) BTPageOpaque opaque; /* - * It's possible to find an all-zeroes page in an index --- for - * example, a backend might successfully extend the relation one page - * and then crash before it is able to make a WAL entry for adding the - * page. If we find a zeroed page then reclaim it. + * It's possible to find an all-zeroes page in an index --- for example, a + * backend might successfully extend the relation one page and then crash + * before it is able to make a WAL entry for adding the page. If we find a + * zeroed page then reclaim it. */ if (PageIsNew(page)) return true; @@ -672,9 +671,9 @@ _bt_delitems(Relation rel, Buffer buf, rdata[0].next = &(rdata[1]); /* - * The target-offsets array is not in the buffer, but pretend that - * it is. When XLogInsert stores the whole buffer, the offsets - * array need not be stored too. + * The target-offsets array is not in the buffer, but pretend that it + * is. When XLogInsert stores the whole buffer, the offsets array + * need not be stored too. */ if (nitems > 0) { @@ -747,8 +746,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) BTPageOpaque opaque; /* - * We can never delete rightmost pages nor root pages. While at it, - * check that page is not already deleted and is empty. + * We can never delete rightmost pages nor root pages. While at it, check + * that page is not already deleted and is empty. */ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -760,8 +759,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) } /* - * Save info about page, including a copy of its high key (it must - * have one, being non-rightmost). + * Save info about page, including a copy of its high key (it must have + * one, being non-rightmost). */ target = BufferGetBlockNumber(buf); targetlevel = opaque->btpo.level; @@ -770,11 +769,11 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) targetkey = CopyBTItem((BTItem) PageGetItem(page, itemid)); /* - * We need to get an approximate pointer to the page's parent page. - * Use the standard search mechanism to search for the page's high - * key; this will give us a link to either the current parent or - * someplace to its left (if there are multiple equal high keys). 
To - * avoid deadlocks, we'd better drop the target page lock first. + * We need to get an approximate pointer to the page's parent page. Use + * the standard search mechanism to search for the page's high key; this + * will give us a link to either the current parent or someplace to its + * left (if there are multiple equal high keys). To avoid deadlocks, we'd + * better drop the target page lock first. */ _bt_relbuf(rel, buf); /* we need a scan key to do our search, so build one */ @@ -786,9 +785,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) _bt_relbuf(rel, lbuf); /* - * If we are trying to delete an interior page, _bt_search did more - * than we needed. Locate the stack item pointing to our parent - * level. + * If we are trying to delete an interior page, _bt_search did more than + * we needed. Locate the stack item pointing to our parent level. */ ilevel = 0; for (;;) @@ -803,16 +801,15 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) /* * We have to lock the pages we need to modify in the standard order: - * moving right, then up. Else we will deadlock against other - * writers. + * moving right, then up. Else we will deadlock against other writers. * - * So, we need to find and write-lock the current left sibling of the - * target page. The sibling that was current a moment ago could have - * split, so we may have to move right. This search could fail if - * either the sibling or the target page was deleted by someone else - * meanwhile; if so, give up. (Right now, that should never happen, - * since page deletion is only done in VACUUM and there shouldn't be - * multiple VACUUMs concurrently on the same table.) + * So, we need to find and write-lock the current left sibling of the target + * page. The sibling that was current a moment ago could have split, so + * we may have to move right. This search could fail if either the + * sibling or the target page was deleted by someone else meanwhile; if + * so, give up. (Right now, that should never happen, since page deletion + * is only done in VACUUM and there shouldn't be multiple VACUUMs + * concurrently on the same table.) */ if (leftsib != P_NONE) { @@ -839,19 +836,18 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) lbuf = InvalidBuffer; /* - * Next write-lock the target page itself. It should be okay to take - * just a write lock not a superexclusive lock, since no scans would - * stop on an empty page. + * Next write-lock the target page itself. It should be okay to take just + * a write lock not a superexclusive lock, since no scans would stop on an + * empty page. */ buf = _bt_getbuf(rel, target, BT_WRITE); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* - * Check page is still empty etc, else abandon deletion. The empty - * check is necessary since someone else might have inserted into it - * while we didn't have it locked; the others are just for paranoia's - * sake. + * Check page is still empty etc, else abandon deletion. The empty check + * is necessary since someone else might have inserted into it while we + * didn't have it locked; the others are just for paranoia's sake. */ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) @@ -872,9 +868,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); /* - * Next find and write-lock the current parent of the target page. 
- * This is essentially the same as the corresponding step of - * splitting. + * Next find and write-lock the current parent of the target page. This is + * essentially the same as the corresponding step of splitting. */ ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid), target, P_HIKEY); @@ -887,8 +882,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) /* * If the target is the rightmost child of its parent, then we can't - * delete, unless it's also the only child --- in which case the - * parent changes to half-dead status. + * delete, unless it's also the only child --- in which case the parent + * changes to half-dead status. */ page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -917,11 +912,10 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) } /* - * If we are deleting the next-to-last page on the target's level, - * then the rightsib is a candidate to become the new fast root. (In - * theory, it might be possible to push the fast root even further - * down, but the odds of doing so are slim, and the locking - * considerations daunting.) + * If we are deleting the next-to-last page on the target's level, then + * the rightsib is a candidate to become the new fast root. (In theory, it + * might be possible to push the fast root even further down, but the odds + * of doing so are slim, and the locking considerations daunting.) * * We can safely acquire a lock on the metapage here --- see comments for * _bt_newroot(). @@ -939,9 +933,9 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) metad = BTPageGetMeta(metapg); /* - * The expected case here is btm_fastlevel == targetlevel+1; - * if the fastlevel is <= targetlevel, something is wrong, and - * we choose to overwrite it to fix it. + * The expected case here is btm_fastlevel == targetlevel+1; if + * the fastlevel is <= targetlevel, something is wrong, and we + * choose to overwrite it to fix it. */ if (metad->btm_fastlevel > targetlevel + 1) { @@ -961,9 +955,9 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) /* * Update parent. The normal case is a tad tricky because we want to - * delete the target's downlink and the *following* key. Easiest way - * is to copy the right sibling's downlink over the target downlink, - * and then delete the following item. + * delete the target's downlink and the *following* key. Easiest way is + * to copy the right sibling's downlink over the target downlink, and then + * delete the following item. */ page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -992,8 +986,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) } /* - * Update siblings' side-links. Note the target page's side-links - * will continue to point to the siblings. + * Update siblings' side-links. Note the target page's side-links will + * continue to point to the siblings. */ if (BufferIsValid(lbuf)) { @@ -1123,10 +1117,10 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) _bt_wrtbuf(rel, lbuf); /* - * If parent became half dead, recurse to try to delete it. Otherwise, - * if right sibling is empty and is now the last child of the parent, - * recurse to try to delete it. (These cases cannot apply at the same - * time, though the second case might itself recurse to the first.) + * If parent became half dead, recurse to try to delete it. Otherwise, if + * right sibling is empty and is now the last child of the parent, recurse + * to try to delete it. 
(These cases cannot apply at the same time, + * though the second case might itself recurse to the first.) */ if (parent_half_dead) { diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index d4232c847f8..10e2fe6190d 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -12,7 +12,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.131 2005/09/02 19:02:19 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.132 2005/10/15 02:49:09 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -39,9 +39,9 @@ typedef struct BTSpool *spool; /* - * spool2 is needed only when the index is an unique index. Dead - * tuples are put into spool2 instead of spool in order to avoid - * uniqueness check. + * spool2 is needed only when the index is an unique index. Dead tuples + * are put into spool2 instead of spool in order to avoid uniqueness + * check. */ BTSpool *spool2; double indtuples; @@ -72,10 +72,10 @@ btbuild(PG_FUNCTION_ARGS) BTBuildState buildstate; /* - * bootstrap processing does something strange, so don't use - * sort/build for initial catalog indices. at some point i need to - * look harder at this. (there is some kind of incremental processing - * going on there.) -- pma 08/29/95 + * bootstrap processing does something strange, so don't use sort/build + * for initial catalog indices. at some point i need to look harder at + * this. (there is some kind of incremental processing going on there.) + * -- pma 08/29/95 */ buildstate.usefast = (FastBuild && IsNormalProcessingMode()); buildstate.isUnique = indexInfo->ii_Unique; @@ -91,8 +91,8 @@ btbuild(PG_FUNCTION_ARGS) #endif /* BTREE_BUILD_STATS */ /* - * We expect to be called exactly once for any index relation. If - * that's not the case, big trouble's what we have. + * We expect to be called exactly once for any index relation. If that's + * not the case, big trouble's what we have. */ if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", @@ -103,8 +103,8 @@ btbuild(PG_FUNCTION_ARGS) buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false); /* - * If building a unique index, put dead tuples in a second spool - * to keep them out of the uniqueness check. + * If building a unique index, put dead tuples in a second spool to + * keep them out of the uniqueness check. */ if (indexInfo->ii_Unique) buildstate.spool2 = _bt_spoolinit(index, false, true); @@ -129,8 +129,8 @@ btbuild(PG_FUNCTION_ARGS) /* * if we are doing bottom-up btree build, finish the build by (1) - * completing the sort of the spool file, (2) inserting the sorted - * tuples into btree pages and (3) building the upper levels. + * completing the sort of the spool file, (2) inserting the sorted tuples + * into btree pages and (3) building the upper levels. */ if (buildstate.usefast) { @@ -176,9 +176,8 @@ btbuildCallback(Relation index, btitem = _bt_formitem(itup); /* - * if we are doing bottom-up btree build, we insert the index into a - * spool file for subsequent processing. otherwise, we insert into - * the btree. + * if we are doing bottom-up btree build, we insert the index into a spool + * file for subsequent processing. otherwise, we insert into the btree. 
*/ if (buildstate->usefast) { @@ -248,16 +247,16 @@ btgettuple(PG_FUNCTION_ARGS) bool res; /* - * If we've already initialized this scan, we can just advance it in - * the appropriate direction. If we haven't done so yet, we call a - * routine to get the first item in the scan. + * If we've already initialized this scan, we can just advance it in the + * appropriate direction. If we haven't done so yet, we call a routine to + * get the first item in the scan. */ if (ItemPointerIsValid(&(scan->currentItemData))) { /* - * Restore scan position using heap TID returned by previous call - * to btgettuple(). _bt_restscan() re-grabs the read lock on the - * buffer, too. + * Restore scan position using heap TID returned by previous call to + * btgettuple(). _bt_restscan() re-grabs the read lock on the buffer, + * too. */ _bt_restscan(scan); @@ -267,17 +266,16 @@ btgettuple(PG_FUNCTION_ARGS) if (scan->kill_prior_tuple) { /* - * Yes, so mark it by setting the LP_DELETE bit in the item - * flags. + * Yes, so mark it by setting the LP_DELETE bit in the item flags. */ offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData)); page = BufferGetPage(so->btso_curbuf); PageGetItemId(page, offnum)->lp_flags |= LP_DELETE; /* - * Since this can be redone later if needed, it's treated the - * same as a commit-hint-bit status update for heap tuples: we - * mark the buffer dirty but don't make a WAL log entry. + * Since this can be redone later if needed, it's treated the same + * as a commit-hint-bit status update for heap tuples: we mark the + * buffer dirty but don't make a WAL log entry. */ SetBufferCommitInfoNeedsSave(so->btso_curbuf); } @@ -306,11 +304,11 @@ btgettuple(PG_FUNCTION_ARGS) } /* - * Save heap TID to use it in _bt_restscan. Then release the read - * lock on the buffer so that we aren't blocking other backends. + * Save heap TID to use it in _bt_restscan. Then release the read lock on + * the buffer so that we aren't blocking other backends. * - * NOTE: we do keep the pin on the buffer! This is essential to ensure - * that someone else doesn't delete the index entry we are stopped on. + * NOTE: we do keep the pin on the buffer! This is essential to ensure that + * someone else doesn't delete the index entry we are stopped on. */ if (res) { @@ -333,7 +331,7 @@ Datum btgetmulti(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); - ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1); + ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1); int32 max_tids = PG_GETARG_INT32(2); int32 *returned_tids = (int32 *) PG_GETARG_POINTER(3); BTScanOpaque so = (BTScanOpaque) scan->opaque; @@ -355,6 +353,7 @@ btgetmulti(PG_FUNCTION_ARGS) res = _bt_next(scan, ForwardScanDirection); else res = _bt_first(scan, ForwardScanDirection); + /* * Skip killed tuples if asked to. */ @@ -381,8 +380,8 @@ btgetmulti(PG_FUNCTION_ARGS) } /* - * Save heap TID to use it in _bt_restscan. Then release the read - * lock on the buffer so that we aren't blocking other backends. + * Save heap TID to use it in _bt_restscan. Then release the read lock on + * the buffer so that we aren't blocking other backends. */ if (res) { @@ -456,8 +455,8 @@ btrescan(PG_FUNCTION_ARGS) } /* - * Reset the scan keys. Note that keys ordering stuff moved to - * _bt_first. - vadim 05/05/97 + * Reset the scan keys. Note that keys ordering stuff moved to _bt_first. 
+ * - vadim 05/05/97 */ if (scankey && scan->numberOfKeys > 0) memmove(scan->keyData, @@ -593,21 +592,20 @@ btbulkdelete(PG_FUNCTION_ARGS) num_index_tuples = 0; /* - * The outer loop iterates over index leaf pages, the inner over items - * on a leaf page. We issue just one _bt_delitems() call per page, so - * as to minimize WAL traffic. + * The outer loop iterates over index leaf pages, the inner over items on + * a leaf page. We issue just one _bt_delitems() call per page, so as to + * minimize WAL traffic. * * Note that we exclusive-lock every leaf page containing data items, in - * sequence left to right. It sounds attractive to only - * exclusive-lock those containing items we need to delete, but - * unfortunately that is not safe: we could then pass a stopped - * indexscan, which could in rare cases lead to deleting the item it - * needs to find when it resumes. (See _bt_restscan --- this could - * only happen if an indexscan stops on a deletable item and then a - * page split moves that item into a page further to its right, which - * the indexscan will have no pin on.) We can skip obtaining - * exclusive lock on empty pages though, since no indexscan could be - * stopped on those. + * sequence left to right. It sounds attractive to only exclusive-lock + * those containing items we need to delete, but unfortunately that is not + * safe: we could then pass a stopped indexscan, which could in rare cases + * lead to deleting the item it needs to find when it resumes. (See + * _bt_restscan --- this could only happen if an indexscan stops on a + * deletable item and then a page split moves that item into a page + * further to its right, which the indexscan will have no pin on.) We can + * skip obtaining exclusive lock on empty pages though, since no indexscan + * could be stopped on those. */ buf = _bt_get_endpoint(rel, 0, false); if (BufferIsValid(buf)) /* check for empty index */ @@ -632,15 +630,15 @@ btbulkdelete(PG_FUNCTION_ARGS) if (minoff <= maxoff && !P_ISDELETED(opaque)) { /* - * Trade in the initial read lock for a super-exclusive - * write lock on this page. + * Trade in the initial read lock for a super-exclusive write + * lock on this page. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); /* - * Recompute minoff/maxoff, both of which could have - * changed while we weren't holding the lock. + * Recompute minoff/maxoff, both of which could have changed + * while we weren't holding the lock. */ minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); @@ -657,7 +655,7 @@ btbulkdelete(PG_FUNCTION_ARGS) ItemPointer htup; btitem = (BTItem) PageGetItem(page, - PageGetItemId(page, offnum)); + PageGetItemId(page, offnum)); htup = &(btitem->bti_itup.t_tid); if (callback(htup, callback_state)) { @@ -670,8 +668,8 @@ btbulkdelete(PG_FUNCTION_ARGS) } /* - * If we need to delete anything, do it and write the buffer; - * else just release the buffer. + * If we need to delete anything, do it and write the buffer; else + * just release the buffer. */ nextpage = opaque->btpo_next; if (ndeletable > 0) @@ -725,19 +723,19 @@ btvacuumcleanup(PG_FUNCTION_ARGS) Assert(stats != NULL); /* - * First find out the number of pages in the index. We must acquire - * the relation-extension lock while doing this to avoid a race - * condition: if someone else is extending the relation, there is - * a window where bufmgr/smgr have created a new all-zero page but - * it hasn't yet been write-locked by _bt_getbuf(). 
If we manage to - * scan such a page here, we'll improperly assume it can be recycled. - * Taking the lock synchronizes things enough to prevent a problem: - * either num_pages won't include the new page, or _bt_getbuf already - * has write lock on the buffer and it will be fully initialized before - * we can examine it. (See also vacuumlazy.c, which has the same issue.) + * First find out the number of pages in the index. We must acquire the + * relation-extension lock while doing this to avoid a race condition: if + * someone else is extending the relation, there is a window where + * bufmgr/smgr have created a new all-zero page but it hasn't yet been + * write-locked by _bt_getbuf(). If we manage to scan such a page here, + * we'll improperly assume it can be recycled. Taking the lock + * synchronizes things enough to prevent a problem: either num_pages won't + * include the new page, or _bt_getbuf already has write lock on the + * buffer and it will be fully initialized before we can examine it. (See + * also vacuumlazy.c, which has the same issue.) * - * We can skip locking for new or temp relations, - * however, since no one else could be accessing them. + * We can skip locking for new or temp relations, however, since no one else + * could be accessing them. */ needLock = !RELATION_IS_LOCAL(rel); @@ -807,12 +805,12 @@ btvacuumcleanup(PG_FUNCTION_ARGS) /* * During VACUUM FULL it's okay to recycle deleted pages - * immediately, since there can be no other transactions - * scanning the index. Note that we will only recycle the - * current page and not any parent pages that _bt_pagedel - * might have recursed to; this seems reasonable in the name - * of simplicity. (Trying to do otherwise would mean we'd - * have to sort the list of recyclable pages we're building.) + * immediately, since there can be no other transactions scanning + * the index. Note that we will only recycle the current page and + * not any parent pages that _bt_pagedel might have recursed to; + * this seems reasonable in the name of simplicity. (Trying to do + * otherwise would mean we'd have to sort the list of recyclable + * pages we're building.) */ if (ndel && info->vacuum_full) { @@ -827,10 +825,10 @@ btvacuumcleanup(PG_FUNCTION_ARGS) } /* - * During VACUUM FULL, we truncate off any recyclable pages at the end - * of the index. In a normal vacuum it'd be unsafe to do this except - * by acquiring exclusive lock on the index and then rechecking all - * the pages; doesn't seem worth it. + * During VACUUM FULL, we truncate off any recyclable pages at the end of + * the index. In a normal vacuum it'd be unsafe to do this except by + * acquiring exclusive lock on the index and then rechecking all the + * pages; doesn't seem worth it. */ if (info->vacuum_full && nFreePages > 0) { @@ -857,9 +855,9 @@ btvacuumcleanup(PG_FUNCTION_ARGS) } /* - * Update the shared Free Space Map with the info we now have about - * free pages in the index, discarding any old info the map may have. - * We do not need to sort the page numbers; they're in order already. + * Update the shared Free Space Map with the info we now have about free + * pages in the index, discarding any old info the map may have. We do not + * need to sort the page numbers; they're in order already. 
*/ RecordIndexFreeSpace(&rel->rd_node, nFreePages, freePages); @@ -915,15 +913,15 @@ _bt_restscan(IndexScanDesc scan) opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* - * We use this as flag when first index tuple on page is deleted but - * we do not move left (this would slowdown vacuum) - so we set + * We use this as flag when first index tuple on page is deleted but we do + * not move left (this would slowdown vacuum) - so we set * current->ip_posid before first index tuple on the current page * (_bt_step will move it right)... XXX still needed? */ if (!ItemPointerIsValid(target)) { ItemPointerSetOffsetNumber(current, - OffsetNumberPrev(P_FIRSTDATAKEY(opaque))); + OffsetNumberPrev(P_FIRSTDATAKEY(opaque))); return; } @@ -948,12 +946,12 @@ _bt_restscan(IndexScanDesc scan) } /* - * The item we're looking for moved right at least one page, so - * move right. We are careful here to pin and read-lock the next - * non-dead page before releasing the current one. This ensures - * that a concurrent btbulkdelete scan cannot pass our position - * --- if it did, it might be able to reach and delete our target - * item before we can find it again. + * The item we're looking for moved right at least one page, so move + * right. We are careful here to pin and read-lock the next non-dead + * page before releasing the current one. This ensures that a + * concurrent btbulkdelete scan cannot pass our position --- if it + * did, it might be able to reach and delete our target item before we + * can find it again. */ if (P_RIGHTMOST(opaque)) elog(ERROR, "failed to re-find previous key in \"%s\"", diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index c029824fa6f..06075dd3dda 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.94 2005/10/06 02:29:12 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.95 2005/10/15 02:49:09 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -69,9 +69,9 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, BTStack new_stack; /* - * Race -- the page we just grabbed may have split since we read - * its pointer in the parent (or metapage). If it has, we may - * need to move right to its new sibling. Do that. + * Race -- the page we just grabbed may have split since we read its + * pointer in the parent (or metapage). If it has, we may need to + * move right to its new sibling. Do that. */ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ); @@ -82,8 +82,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, break; /* - * Find the appropriate item on the internal page, and get the - * child page that it points to. + * Find the appropriate item on the internal page, and get the child + * page that it points to. */ offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey); itemid = PageGetItemId(page, offnum); @@ -94,13 +94,13 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, /* * We need to save the location of the index entry we chose in the - * parent page on a stack. In case we split the tree, we'll use - * the stack to work back up to the parent page. 
We also save the - * actual downlink (TID) to uniquely identify the index entry, in - * case it moves right while we're working lower in the tree. See - * the paper by Lehman and Yao for how this is detected and - * handled. (We use the child link to disambiguate duplicate keys - * in the index -- Lehman and Yao disallow duplicate keys.) + * parent page on a stack. In case we split the tree, we'll use the + * stack to work back up to the parent page. We also save the actual + * downlink (TID) to uniquely identify the index entry, in case it + * moves right while we're working lower in the tree. See the paper + * by Lehman and Yao for how this is detected and handled. (We use the + * child link to disambiguate duplicate keys in the index -- Lehman + * and Yao disallow duplicate keys.) */ new_stack = (BTStack) palloc(sizeof(BTStackData)); new_stack->bts_blkno = par_blkno; @@ -156,19 +156,18 @@ _bt_moveright(Relation rel, opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* - * When nextkey = false (normal case): if the scan key that brought us - * to this page is > the high key stored on the page, then the page - * has split and we need to move right. (If the scan key is equal to - * the high key, we might or might not need to move right; have to - * scan the page first anyway.) + * When nextkey = false (normal case): if the scan key that brought us to + * this page is > the high key stored on the page, then the page has split + * and we need to move right. (If the scan key is equal to the high key, + * we might or might not need to move right; have to scan the page first + * anyway.) * * When nextkey = true: move right if the scan key is >= page's high key. * - * The page could even have split more than once, so scan as far as - * needed. + * The page could even have split more than once, so scan as far as needed. * - * We also have to move right if we followed a link that brought us to a - * dead page. + * We also have to move right if we followed a link that brought us to a dead + * page. */ cmpval = nextkey ? 0 : 1; @@ -242,24 +241,24 @@ _bt_binsrch(Relation rel, high = PageGetMaxOffsetNumber(page); /* - * If there are no keys on the page, return the first available slot. - * Note this covers two cases: the page is really empty (no keys), or - * it contains only a high key. The latter case is possible after - * vacuuming. This can never happen on an internal page, however, - * since they are never empty (an internal page must have children). + * If there are no keys on the page, return the first available slot. Note + * this covers two cases: the page is really empty (no keys), or it + * contains only a high key. The latter case is possible after vacuuming. + * This can never happen on an internal page, however, since they are + * never empty (an internal page must have children). */ if (high < low) return low; /* - * Binary search to find the first key on the page >= scan key, or - * first key > scankey when nextkey is true. + * Binary search to find the first key on the page >= scan key, or first + * key > scankey when nextkey is true. * * For nextkey=false (cmpval=1), the loop invariant is: all slots before * 'low' are < scan key, all slots at or after 'high' are >= scan key. * - * For nextkey=true (cmpval=0), the loop invariant is: all slots before - * 'low' are <= scan key, all slots at or after 'high' are > scan key. + * For nextkey=true (cmpval=0), the loop invariant is: all slots before 'low' + * are <= scan key, all slots at or after 'high' are > scan key. 
* * We can fall out when high == low. */ @@ -285,15 +284,15 @@ _bt_binsrch(Relation rel, * At this point we have high == low, but be careful: they could point * past the last slot on the page. * - * On a leaf page, we always return the first key >= scan key (resp. > - * scan key), which could be the last slot + 1. + * On a leaf page, we always return the first key >= scan key (resp. > scan + * key), which could be the last slot + 1. */ if (P_ISLEAF(opaque)) return low; /* - * On a non-leaf page, return the last key < scan key (resp. <= scan - * key). There must be one if _bt_compare() is playing by the rules. + * On a non-leaf page, return the last key < scan key (resp. <= scan key). + * There must be one if _bt_compare() is playing by the rules. */ Assert(low > P_FIRSTDATAKEY(opaque)); @@ -337,8 +336,8 @@ _bt_compare(Relation rel, int i; /* - * Force result ">" if target item is first data item on an internal - * page --- see NOTE above. + * Force result ">" if target item is first data item on an internal page + * --- see NOTE above. */ if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) return 1; @@ -347,15 +346,15 @@ _bt_compare(Relation rel, itup = &(btitem->bti_itup); /* - * The scan key is set up with the attribute number associated with - * each term in the key. It is important that, if the index is - * multi-key, the scan contain the first k key attributes, and that - * they be in order. If you think about how multi-key ordering works, - * you'll understand why this is. + * The scan key is set up with the attribute number associated with each + * term in the key. It is important that, if the index is multi-key, the + * scan contain the first k key attributes, and that they be in order. If + * you think about how multi-key ordering works, you'll understand why + * this is. * - * We don't test for violation of this condition here, however. The - * initial setup for the index scan had better have gotten it right - * (see _bt_first). + * We don't test for violation of this condition here, however. The initial + * setup for the index scan had better have gotten it right (see + * _bt_first). */ for (i = 1; i <= keysz; i++) @@ -381,15 +380,15 @@ _bt_compare(Relation rel, else { /* - * The sk_func needs to be passed the index value as left arg - * and the sk_argument as right arg (they might be of - * different types). Since it is convenient for callers to - * think of _bt_compare as comparing the scankey to the index - * item, we have to flip the sign of the comparison result. + * The sk_func needs to be passed the index value as left arg and + * the sk_argument as right arg (they might be of different + * types). Since it is convenient for callers to think of + * _bt_compare as comparing the scankey to the index item, we have + * to flip the sign of the comparison result. * - * Note: curious-looking coding is to avoid overflow if - * comparison function returns INT_MIN. There is no risk of - * overflow for positive results. + * Note: curious-looking coding is to avoid overflow if comparison + * function returns INT_MIN. There is no risk of overflow for + * positive results. 
*/ result = DatumGetInt32(FunctionCall2(&scankey->sk_func, datum, @@ -497,7 +496,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) bool goback; bool continuescan; ScanKey startKeys[INDEX_MAX_KEYS]; - ScanKeyData scankeys[INDEX_MAX_KEYS]; + ScanKeyData scankeys[INDEX_MAX_KEYS]; int keysCount = 0; int i; StrategyNumber strat_total; @@ -505,8 +504,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) pgstat_count_index_scan(&scan->xs_pgstat_info); /* - * Examine the scan keys and eliminate any redundant keys; also - * discover how many keys must be matched to continue the scan. + * Examine the scan keys and eliminate any redundant keys; also discover + * how many keys must be matched to continue the scan. */ _bt_preprocess_keys(scan); @@ -556,9 +555,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ScanKey cur; /* - * chosen is the so-far-chosen key for the current attribute, if - * any. We don't cast the decision in stone until we reach keys - * for the next attribute. + * chosen is the so-far-chosen key for the current attribute, if any. + * We don't cast the decision in stone until we reach keys for the + * next attribute. */ curattr = 1; chosen = NULL; @@ -595,9 +594,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* - * Done if that was the last attribute, or if next key - * is not in sequence (implying no boundary key is available - * for the next attribute). + * Done if that was the last attribute, or if next key is not + * in sequence (implying no boundary key is available for the + * next attribute). */ if (i >= so->numberOfKeys || cur->sk_attno != curattr + 1) @@ -632,17 +631,17 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* - * If we found no usable boundary keys, we have to start from one end - * of the tree. Walk down that edge to the first or last key, and - * scan from there. + * If we found no usable boundary keys, we have to start from one end of + * the tree. Walk down that edge to the first or last key, and scan from + * there. */ if (keysCount == 0) return _bt_endpoint(scan, dir); /* * We want to start the scan somewhere within the index. Set up a - * 3-way-comparison scankey we can use to search for the boundary - * point we identified above. + * 3-way-comparison scankey we can use to search for the boundary point we + * identified above. */ Assert(keysCount <= INDEX_MAX_KEYS); for (i = 0; i < keysCount; i++) @@ -650,16 +649,15 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ScanKey cur = startKeys[i]; /* - * _bt_preprocess_keys disallows it, but it's place to add some - * code later + * _bt_preprocess_keys disallows it, but it's place to add some code + * later */ if (cur->sk_flags & SK_ISNULL) elog(ERROR, "btree doesn't support is(not)null, yet"); /* - * If scankey operator is of default subtype, we can use the - * cached comparison procedure; otherwise gotta look it up in the - * catalogs. + * If scankey operator is of default subtype, we can use the cached + * comparison procedure; otherwise gotta look it up in the catalogs. */ if (cur->sk_subtype == InvalidOid) { @@ -692,13 +690,13 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* - * Examine the selected initial-positioning strategy to determine - * exactly where we need to start the scan, and set flag variables to - * control the code below. + * Examine the selected initial-positioning strategy to determine exactly + * where we need to start the scan, and set flag variables to control the + * code below. 
* - * If nextkey = false, _bt_search and _bt_binsrch will locate the first - * item >= scan key. If nextkey = true, they will locate the first - * item > scan key. + * If nextkey = false, _bt_search and _bt_binsrch will locate the first item + * >= scan key. If nextkey = true, they will locate the first item > scan + * key. * * If goback = true, we will then step back one item, while if goback = * false, we will start the scan on the located item. @@ -710,10 +708,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) case BTLessStrategyNumber: /* - * Find first item >= scankey, then back up one to arrive at - * last item < scankey. (Note: this positioning strategy is - * only used for a backward scan, so that is always the - * correct starting position.) + * Find first item >= scankey, then back up one to arrive at last + * item < scankey. (Note: this positioning strategy is only used + * for a backward scan, so that is always the correct starting + * position.) */ nextkey = false; goback = true; @@ -722,10 +720,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) case BTLessEqualStrategyNumber: /* - * Find first item > scankey, then back up one to arrive at - * last item <= scankey. (Note: this positioning strategy is - * only used for a backward scan, so that is always the - * correct starting position.) + * Find first item > scankey, then back up one to arrive at last + * item <= scankey. (Note: this positioning strategy is only used + * for a backward scan, so that is always the correct starting + * position.) */ nextkey = true; goback = true; @@ -734,14 +732,14 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) case BTEqualStrategyNumber: /* - * If a backward scan was specified, need to start with last - * equal item not first one. + * If a backward scan was specified, need to start with last equal + * item not first one. */ if (ScanDirectionIsBackward(dir)) { /* - * This is the same as the <= strategy. We will check at - * the end whether the found item is actually =. + * This is the same as the <= strategy. We will check at the + * end whether the found item is actually =. */ nextkey = true; goback = true; @@ -749,8 +747,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) else { /* - * This is the same as the >= strategy. We will check at - * the end whether the found item is actually =. + * This is the same as the >= strategy. We will check at the + * end whether the found item is actually =. */ nextkey = false; goback = false; @@ -813,24 +811,24 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ItemPointerSet(current, blkno, offnum); /* - * If nextkey = false, we are positioned at the first item >= scan - * key, or possibly at the end of a page on which all the existing - * items are less than the scan key and we know that everything on - * later pages is greater than or equal to scan key. + * If nextkey = false, we are positioned at the first item >= scan key, or + * possibly at the end of a page on which all the existing items are less + * than the scan key and we know that everything on later pages is greater + * than or equal to scan key. * * If nextkey = true, we are positioned at the first item > scan key, or - * possibly at the end of a page on which all the existing items are - * less than or equal to the scan key and we know that everything on - * later pages is greater than scan key. + * possibly at the end of a page on which all the existing items are less + * than or equal to the scan key and we know that everything on later + * pages is greater than scan key. 
* - * The actually desired starting point is either this item or the prior - * one, or in the end-of-page case it's the first item on the next - * page or the last item on this page. We apply _bt_step if needed to - * get to the right place. + * The actually desired starting point is either this item or the prior one, + * or in the end-of-page case it's the first item on the next page or the + * last item on this page. We apply _bt_step if needed to get to the + * right place. * * If _bt_step fails (meaning we fell off the end of the index in one - * direction or the other), then there are no matches so we just - * return false. + * direction or the other), then there are no matches so we just return + * false. */ if (goback) { @@ -902,8 +900,8 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) BlockNumber blkno; /* - * Don't use ItemPointerGetOffsetNumber or you risk to get assertion - * due to ability of ip_posid to be equal 0. + * Don't use ItemPointerGetOffsetNumber or you risk to get assertion due + * to ability of ip_posid to be equal 0. */ offnum = current->ip_posid; @@ -954,9 +952,9 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* * Walk left to the next page with data. This is much more * complex than the walk-right case because of the possibility - * that the page to our left splits while we are in flight to - * it, plus the possibility that the page we were on gets - * deleted after we leave it. See nbtree/README for details. + * that the page to our left splits while we are in flight to it, + * plus the possibility that the page we were on gets deleted + * after we leave it. See nbtree/README for details. */ for (;;) { @@ -973,9 +971,9 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* - * Okay, we managed to move left to a non-deleted page. - * Done if it's not half-dead and not empty. Else loop - * back and do it all again. + * Okay, we managed to move left to a non-deleted page. Done + * if it's not half-dead and not empty. Else loop back and do + * it all again. */ if (!P_IGNORE(opaque)) { @@ -1043,15 +1041,14 @@ _bt_walk_left(Relation rel, Buffer buf) /* * If this isn't the page we want, walk right till we find what we - * want --- but go no more than four hops (an arbitrary limit). If - * we don't find the correct page by then, the most likely bet is - * that the original page got deleted and isn't in the sibling - * chain at all anymore, not that its left sibling got split more - * than four times. + * want --- but go no more than four hops (an arbitrary limit). If we + * don't find the correct page by then, the most likely bet is that + * the original page got deleted and isn't in the sibling chain at all + * anymore, not that its left sibling got split more than four times. * - * Note that it is correct to test P_ISDELETED not P_IGNORE here, - * because half-dead pages are still in the sibling chain. Caller - * must reject half-dead pages if wanted. + * Note that it is correct to test P_ISDELETED not P_IGNORE here, because + * half-dead pages are still in the sibling chain. Caller must reject + * half-dead pages if wanted. */ tries = 0; for (;;) @@ -1077,9 +1074,9 @@ _bt_walk_left(Relation rel, Buffer buf) { /* * It was deleted. Move right to first nondeleted page (there - * must be one); that is the page that has acquired the - * deleted one's keyspace, so stepping left from it will take - * us where we want to be. 
+ * must be one); that is the page that has acquired the deleted + * one's keyspace, so stepping left from it will take us where we + * want to be. */ for (;;) { @@ -1095,16 +1092,16 @@ _bt_walk_left(Relation rel, Buffer buf) } /* - * Now return to top of loop, resetting obknum to point to - * this nondeleted page, and try again. + * Now return to top of loop, resetting obknum to point to this + * nondeleted page, and try again. */ } else { /* - * It wasn't deleted; the explanation had better be that the - * page to the left got split or deleted. Without this check, - * we'd go into an infinite loop if there's anything wrong. + * It wasn't deleted; the explanation had better be that the page + * to the left got split or deleted. Without this check, we'd go + * into an infinite loop if there's anything wrong. */ if (opaque->btpo_prev == lblkno) elog(ERROR, "could not find left sibling in \"%s\"", @@ -1137,8 +1134,8 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) /* * If we are looking for a leaf page, okay to descend from fast root; - * otherwise better descend from true root. (There is no point in - * being smarter about intermediate levels.) + * otherwise better descend from true root. (There is no point in being + * smarter about intermediate levels.) */ if (level == 0) buf = _bt_getroot(rel, BT_READ); @@ -1159,8 +1156,8 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) /* * If we landed on a deleted page, step right to find a live page * (there must be one). Also, if we want the rightmost page, step - * right if needed to get to it (this could happen if the page - * split since we obtained a pointer to it). + * right if needed to get to it (this could happen if the page split + * since we obtained a pointer to it). */ while (P_IGNORE(opaque) || (rightmost && !P_RIGHTMOST(opaque))) @@ -1228,9 +1225,9 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) so = (BTScanOpaque) scan->opaque; /* - * Scan down to the leftmost or rightmost leaf page. This is a - * simplified version of _bt_search(). We don't maintain a stack - * since we know we won't need it. + * Scan down to the leftmost or rightmost leaf page. This is a simplified + * version of _bt_search(). We don't maintain a stack since we know we + * won't need it. */ buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)); @@ -1261,8 +1258,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) Assert(P_RIGHTMOST(opaque)); start = PageGetMaxOffsetNumber(page); - if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty - * page */ + if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty page */ start = P_FIRSTDATAKEY(opaque); } else @@ -1276,8 +1272,8 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) so->btso_curbuf = buf; /* - * Left/rightmost page could be empty due to deletions, if so step - * till we find a nonempty page. + * Left/rightmost page could be empty due to deletions, if so step till we + * find a nonempty page. */ if (start > maxoff) { @@ -1291,8 +1287,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) itup = &(btitem->bti_itup); /* - * Okay, we are on the first or last tuple. Does it pass all the - * quals? + * Okay, we are on the first or last tuple. Does it pass all the quals? 
*/ if (_bt_checkkeys(scan, itup, dir, &continuescan)) { diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index ee5acee5c3e..6ee5d42b63a 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -56,7 +56,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.94 2005/08/11 13:22:33 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.95 2005/10/15 02:49:09 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -99,12 +99,10 @@ typedef struct BTPageState { Page btps_page; /* workspace for page building */ BlockNumber btps_blkno; /* block # to write this page at */ - BTItem btps_minkey; /* copy of minimum key (first item) on - * page */ + BTItem btps_minkey; /* copy of minimum key (first item) on page */ OffsetNumber btps_lastoff; /* last item offset loaded */ uint32 btps_level; /* tree level (0 = leaf) */ - Size btps_full; /* "full" if less than this much free - * space */ + Size btps_full; /* "full" if less than this much free space */ struct BTPageState *btps_next; /* link to parent level, if any */ } BTPageState; @@ -157,21 +155,21 @@ _bt_spoolinit(Relation index, bool isunique, bool isdead) btspool->isunique = isunique; /* - * We size the sort area as maintenance_work_mem rather than work_mem - * to speed index creation. This should be OK since a single backend - * can't run multiple index creations in parallel. Note that creation - * of a unique index actually requires two BTSpool objects. We expect - * that the second one (for dead tuples) won't get very full, so we - * give it only work_mem. + * We size the sort area as maintenance_work_mem rather than work_mem to + * speed index creation. This should be OK since a single backend can't + * run multiple index creations in parallel. Note that creation of a + * unique index actually requires two BTSpool objects. We expect that the + * second one (for dead tuples) won't get very full, so we give it only + * work_mem. */ btKbytes = isdead ? work_mem : maintenance_work_mem; btspool->sortstate = tuplesort_begin_index(index, isunique, btKbytes, false); /* - * Currently, tuplesort provides sort functions on IndexTuples. If we - * kept anything in a BTItem other than a regular IndexTuple, we'd - * need to modify tuplesort to understand BTItems as such. + * Currently, tuplesort provides sort functions on IndexTuples. If we kept + * anything in a BTItem other than a regular IndexTuple, we'd need to + * modify tuplesort to understand BTItems as such. */ Assert(sizeof(BTItemData) == sizeof(IndexTupleData)); @@ -222,8 +220,8 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) wstate.index = btspool->index; /* - * We need to log index creation in WAL iff WAL archiving is enabled - * AND it's not a temp index. + * We need to log index creation in WAL iff WAL archiving is enabled AND + * it's not a temp index. */ wstate.btws_use_wal = XLogArchivingActive() && !wstate.index->rd_istemp; @@ -313,9 +311,9 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) /* * If we have to write pages nonsequentially, fill in the space with * zeroes until we come back and overwrite. This is not logically - * necessary on standard Unix filesystems (unwritten space will read - * as zeroes anyway), but it should help to avoid fragmentation. The - * dummy pages aren't WAL-logged though. 
+ * necessary on standard Unix filesystems (unwritten space will read as + * zeroes anyway), but it should help to avoid fragmentation. The dummy + * pages aren't WAL-logged though. */ while (blkno > wstate->btws_pages_written) { @@ -328,8 +326,8 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) /* * Now write the page. We say isTemp = true even if it's not a temp - * index, because there's no need for smgr to schedule an fsync for - * this write; we'll do it ourselves before ending the build. + * index, because there's no need for smgr to schedule an fsync for this + * write; we'll do it ourselves before ending the build. */ smgrwrite(wstate->index->rd_smgr, blkno, (char *) page, true); @@ -483,15 +481,15 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti) btisz = MAXALIGN(btisz); /* - * Check whether the item can fit on a btree page at all. (Eventually, - * we ought to try to apply TOAST methods if not.) We actually need to - * be able to fit three items on every page, so restrict any one item - * to 1/3 the per-page available space. Note that at this point, btisz - * doesn't include the ItemId. + * Check whether the item can fit on a btree page at all. (Eventually, we + * ought to try to apply TOAST methods if not.) We actually need to be + * able to fit three items on every page, so restrict any one item to 1/3 + * the per-page available space. Note that at this point, btisz doesn't + * include the ItemId. * - * NOTE: similar code appears in _bt_insertonpg() to defend against - * oversize items being inserted into an already-existing index. But - * during creation of an index, we don't go through there. + * NOTE: similar code appears in _bt_insertonpg() to defend against oversize + * items being inserted into an already-existing index. But during + * creation of an index, we don't go through there. */ if (btisz > BTMaxItemSize(npage)) ereport(ERROR, @@ -499,9 +497,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti) errmsg("index row size %lu exceeds btree maximum, %lu", (unsigned long) btisz, (unsigned long) BTMaxItemSize(npage)), - errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" - "Consider a function index of an MD5 hash of the value, " - "or use full text indexing."))); + errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" + "Consider a function index of an MD5 hash of the value, " + "or use full text indexing."))); if (pgspc < btisz || pgspc < state->btps_full) { @@ -523,11 +521,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti) /* * We copy the last item on the page into the new page, and then - * rearrange the old page so that the 'last item' becomes its high - * key rather than a true data item. There had better be at least - * two items on the page already, else the page would be empty of - * useful data. (Hence, we must allow pages to be packed at least - * 2/3rds full; the 70% figure used above is close to minimum.) + * rearrange the old page so that the 'last item' becomes its high key + * rather than a true data item. There had better be at least two + * items on the page already, else the page would be empty of useful + * data. (Hence, we must allow pages to be packed at least 2/3rds + * full; the 70% figure used above is close to minimum.) 
*/ Assert(last_off > P_FIRSTKEY); ii = PageGetItemId(opage, last_off); @@ -544,8 +542,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti) /* * Link the old page into its parent, using its minimum key. If we - * don't have a parent, we have to create one; this adds a new - * btree level. + * don't have a parent, we have to create one; this adds a new btree + * level. */ if (state->btps_next == NULL) state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); @@ -557,9 +555,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti) pfree(state->btps_minkey); /* - * Save a copy of the minimum key for the new page. We have to - * copy it off the old page, not the new one, in case we are not - * at leaf level. + * Save a copy of the minimum key for the new page. We have to copy + * it off the old page, not the new one, in case we are not at leaf + * level. */ state->btps_minkey = _bt_formitem(&(obti->bti_itup)); @@ -576,8 +574,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti) } /* - * Write out the old page. We never need to touch it again, so we - * can free the opage workspace too. + * Write out the old page. We never need to touch it again, so we can + * free the opage workspace too. */ _bt_blwritepage(wstate, opage, oblkno); @@ -588,10 +586,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti) } /* - * If the new item is the first for its page, stash a copy for later. - * Note this will only happen for the first item on a level; on later - * pages, the first item for a page is copied from the prior page in - * the code above. + * If the new item is the first for its page, stash a copy for later. Note + * this will only happen for the first item on a level; on later pages, + * the first item for a page is copied from the prior page in the code + * above. */ if (last_off == P_HIKEY) { @@ -636,9 +634,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * We have to link the last page on this level to somewhere. * * If we're at the top, it's the root, so attach it to the metapage. - * Otherwise, add an entry for it to its parent using its minimum - * key. This may cause the last page of the parent level to - * split, but that's not a problem -- we haven't gotten to it yet. + * Otherwise, add an entry for it to its parent using its minimum key. + * This may cause the last page of the parent level to split, but + * that's not a problem -- we haven't gotten to it yet. */ if (s->btps_next == NULL) { @@ -657,8 +655,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) } /* - * This is the rightmost page, so the ItemId array needs to be - * slid back one slot. Then we can dump out the page. + * This is the rightmost page, so the ItemId array needs to be slid + * back one slot. Then we can dump out the page. */ _bt_slideleft(s->btps_page); _bt_blwritepage(wstate, s->btps_page, s->btps_blkno); @@ -667,9 +665,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) /* * As the last step in the process, construct the metapage and make it - * point to the new root (unless we had no data at all, in which case - * it's set to point to "P_NONE"). This changes the index to the - * "valid" state by filling in a valid magic number in the metapage. + * point to the new root (unless we had no data at all, in which case it's + * set to point to "P_NONE"). This changes the index to the "valid" state + * by filling in a valid magic number in the metapage. 
*/ metapage = (Page) palloc(BLCKSZ); _bt_initmetapage(metapage, rootblkno, rootlevel); @@ -748,7 +746,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) compare = DatumGetInt32(FunctionCall2(&entry->sk_func, attrDatum1, - attrDatum2)); + attrDatum2)); if (compare > 0) { load1 = false; @@ -772,7 +770,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) if (should_free) pfree(bti); bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, - true, &should_free); + true, &should_free); } else { @@ -780,7 +778,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) if (should_free2) pfree(bti2); bti2 = (BTItem) tuplesort_getindextuple(btspool2->sortstate, - true, &should_free2); + true, &should_free2); } } _bt_freeskey(indexScanKey); @@ -789,7 +787,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) { /* merge is unnecessary */ while ((bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, - true, &should_free)) != NULL) + true, &should_free)) != NULL) { /* When we see first tuple, create first index page */ if (state == NULL) @@ -805,19 +803,19 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) _bt_uppershutdown(wstate, state); /* - * If the index isn't temp, we must fsync it down to disk before it's - * safe to commit the transaction. (For a temp index we don't care - * since the index will be uninteresting after a crash anyway.) + * If the index isn't temp, we must fsync it down to disk before it's safe + * to commit the transaction. (For a temp index we don't care since the + * index will be uninteresting after a crash anyway.) * * It's obvious that we must do this when not WAL-logging the build. It's * less obvious that we have to do it even if we did WAL-log the index - * pages. The reason is that since we're building outside shared - * buffers, a CHECKPOINT occurring during the build has no way to - * flush the previously written data to disk (indeed it won't know the - * index even exists). A crash later on would replay WAL from the - * checkpoint, therefore it wouldn't replay our earlier WAL entries. - * If we do not fsync those pages here, they might still not be on - * disk when the crash occurs. + * pages. The reason is that since we're building outside shared buffers, + * a CHECKPOINT occurring during the build has no way to flush the + * previously written data to disk (indeed it won't know the index even + * exists). A crash later on would replay WAL from the checkpoint, + * therefore it wouldn't replay our earlier WAL entries. If we do not + * fsync those pages here, they might still not be on disk when the crash + * occurs. */ if (!wstate->index->rd_istemp) smgrimmedsync(wstate->index->rd_smgr); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 9a5f8d7ac90..269213d21f7 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.63 2005/06/13 23:14:48 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.64 2005/10/15 02:49:09 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -48,8 +48,8 @@ _bt_mkscankey(Relation rel, IndexTuple itup) bool null; /* - * We can use the cached (default) support procs since no - * cross-type comparison can be needed. 
+ * We can use the cached (default) support procs since no cross-type + * comparison can be needed. */ procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); arg = index_getattr(itup, i + 1, itupdesc, &null); @@ -93,8 +93,8 @@ _bt_mkscankey_nodata(Relation rel) FmgrInfo *procinfo; /* - * We can use the cached (default) support procs since no - * cross-type comparison can be needed. + * We can use the cached (default) support procs since no cross-type + * comparison can be needed. */ procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); ScanKeyEntryInitializeWithInfo(&skey[i], @@ -257,9 +257,9 @@ _bt_preprocess_keys(IndexScanDesc scan) if (numberOfKeys == 1) { /* - * We don't use indices for 'A is null' and 'A is not null' - * currently and 'A < = > <> NULL' will always fail - so qual is - * not OK if comparison value is NULL. - vadim 03/21/97 + * We don't use indices for 'A is null' and 'A is not null' currently + * and 'A < = > <> NULL' will always fail - so qual is not OK if + * comparison value is NULL. - vadim 03/21/97 */ if (cur->sk_flags & SK_ISNULL) so->qual_ok = false; @@ -286,20 +286,20 @@ _bt_preprocess_keys(IndexScanDesc scan) /* * Initialize for processing of keys for attr 1. * - * xform[i] points to the currently best scan key of strategy type i+1, - * if any is found with a default operator subtype; it is NULL if we - * haven't yet found such a key for this attr. Scan keys of - * nondefault subtypes are transferred to the output with no - * processing except for noting if they are of "=" type. + * xform[i] points to the currently best scan key of strategy type i+1, if + * any is found with a default operator subtype; it is NULL if we haven't + * yet found such a key for this attr. Scan keys of nondefault subtypes + * are transferred to the output with no processing except for noting if + * they are of "=" type. */ attno = 1; memset(xform, 0, sizeof(xform)); hasOtherTypeEqual = false; /* - * Loop iterates from 0 to numberOfKeys inclusive; we use the last - * pass to handle after-last-key processing. Actual exit from the - * loop is at the "break" statement below. + * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to + * handle after-last-key processing. Actual exit from the loop is at the + * "break" statement below. */ for (i = 0;; cur++, i++) { @@ -319,8 +319,8 @@ _bt_preprocess_keys(IndexScanDesc scan) } /* - * If we are at the end of the keys for a particular attr, finish - * up processing and emit the cleaned-up keys. + * If we are at the end of the keys for a particular attr, finish up + * processing and emit the cleaned-up keys. */ if (i == numberOfKeys || cur->sk_attno != attno) { @@ -331,9 +331,9 @@ _bt_preprocess_keys(IndexScanDesc scan) elog(ERROR, "btree index keys must be ordered by attribute"); /* - * If = has been specified, no other key will be used. In case - * of key > 2 && key == 1 and so on we have to set qual_ok to - * false before discarding the other keys. + * If = has been specified, no other key will be used. In case of + * key > 2 && key == 1 and so on we have to set qual_ok to false + * before discarding the other keys. */ if (xform[BTEqualStrategyNumber - 1]) { @@ -411,8 +411,8 @@ _bt_preprocess_keys(IndexScanDesc scan) } /* - * If all attrs before this one had "=", include these keys - * into the required-keys count. + * If all attrs before this one had "=", include these keys into + * the required-keys count. 
*/ if (priorNumberOfEqualCols == attno - 1) so->numberOfRequiredKeys = new_numberOfKeys; @@ -526,11 +526,11 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, if (isNull) { /* - * Since NULLs are sorted after non-NULLs, we know we have - * reached the upper limit of the range of values for this - * index attr. On a forward scan, we can stop if this qual is - * one of the "must match" subset. On a backward scan, - * however, we should keep going. + * Since NULLs are sorted after non-NULLs, we know we have reached + * the upper limit of the range of values for this index attr. On + * a forward scan, we can stop if this qual is one of the "must + * match" subset. On a backward scan, however, we should keep + * going. */ if (ikey < so->numberOfRequiredKeys && ScanDirectionIsForward(dir)) @@ -547,24 +547,22 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, if (!DatumGetBool(test)) { /* - * Tuple fails this qual. If it's a required qual, then we - * may be able to conclude no further tuples will pass, - * either. We have to look at the scan direction and the qual - * type. + * Tuple fails this qual. If it's a required qual, then we may be + * able to conclude no further tuples will pass, either. We have + * to look at the scan direction and the qual type. * - * Note: the only case in which we would keep going after failing - * a required qual is if there are partially-redundant quals - * that _bt_preprocess_keys() was unable to eliminate. For - * example, given "x > 4 AND x > 10" where both are cross-type - * comparisons and so not removable, we might start the scan - * at the x = 4 boundary point. The "x > 10" condition will - * fail until we pass x = 10, but we must not stop the scan on - * its account. + * Note: the only case in which we would keep going after failing a + * required qual is if there are partially-redundant quals that + * _bt_preprocess_keys() was unable to eliminate. For example, + * given "x > 4 AND x > 10" where both are cross-type comparisons + * and so not removable, we might start the scan at the x = 4 + * boundary point. The "x > 10" condition will fail until we pass + * x = 10, but we must not stop the scan on its account. * - * Note: because we stop the scan as soon as any required - * equality qual fails, it is critical that equality quals be - * used for the initial positioning in _bt_first() when they - * are available. See comments in _bt_first(). + * Note: because we stop the scan as soon as any required equality + * qual fails, it is critical that equality quals be used for the + * initial positioning in _bt_first() when they are available. See + * comments in _bt_first(). 
*/ if (ikey < so->numberOfRequiredKeys) { diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 078d8529241..61bf93a904b 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.22 2005/06/06 17:01:22 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.23 2005/10/15 02:49:09 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -101,7 +101,7 @@ _bt_restore_page(Page page, char *from, int len) (sizeof(BTItemData) - sizeof(IndexTupleData)); itemsz = MAXALIGN(itemsz); if (PageAddItem(page, (Item) from, itemsz, - FirstOffsetNumber, LP_USED) == InvalidOffsetNumber) + FirstOffsetNumber, LP_USED) == InvalidOffsetNumber) elog(PANIC, "_bt_restore_page: can't add item to page"); from += itemsz; } @@ -136,8 +136,8 @@ _bt_restore_meta(Relation reln, XLogRecPtr lsn, pageop->btpo_flags = BTP_META; /* - * Set pd_lower just past the end of the metadata. This is not - * essential but it makes the page look compressible to xlog.c. + * Set pd_lower just past the end of the metadata. This is not essential + * but it makes the page look compressible to xlog.c. */ ((PageHeader) metapg)->pd_lower = ((char *) md + sizeof(BTMetaPageData)) - (char *) metapg; @@ -181,7 +181,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, if (!(record->xl_info & XLR_BKP_BLOCK_1)) { buffer = XLogReadBuffer(false, reln, - ItemPointerGetBlockNumber(&(xlrec->target.tid))); + ItemPointerGetBlockNumber(&(xlrec->target.tid))); if (!BufferIsValid(buffer)) elog(PANIC, "btree_insert_redo: block unfound"); page = (Page) BufferGetPage(buffer); @@ -217,8 +217,8 @@ btree_xlog_insert(bool isleaf, bool ismeta, if (!isleaf && incomplete_splits != NIL) { forget_matching_split(reln, xlrec->target.node, - ItemPointerGetBlockNumber(&(xlrec->target.tid)), - ItemPointerGetOffsetNumber(&(xlrec->target.tid)), + ItemPointerGetBlockNumber(&(xlrec->target.tid)), + ItemPointerGetOffsetNumber(&(xlrec->target.tid)), false); } } @@ -325,8 +325,8 @@ btree_xlog_split(bool onleft, bool isroot, if (xlrec->level > 0 && incomplete_splits != NIL) { forget_matching_split(reln, xlrec->target.node, - ItemPointerGetBlockNumber(&(xlrec->target.tid)), - ItemPointerGetOffsetNumber(&(xlrec->target.tid)), + ItemPointerGetBlockNumber(&(xlrec->target.tid)), + ItemPointerGetOffsetNumber(&(xlrec->target.tid)), false); } @@ -655,7 +655,7 @@ static void out_target(char *buf, xl_btreetid *target) { sprintf(buf + strlen(buf), "rel %u/%u/%u; tid %u/%u", - target->node.spcNode, target->node.dbNode, target->node.relNode, + target->node.spcNode, target->node.dbNode, target->node.relNode, ItemPointerGetBlockNumber(&(target->tid)), ItemPointerGetOffsetNumber(&(target->tid))); } |
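The reflowed comments in _bt_binsrch() above spell out the search invariant: with nextkey = false the loop locates the first slot whose key is >= the scan key, with nextkey = true the first slot whose key is > the scan key, and the two cases are selected by cmpval = nextkey ? 0 : 1. The following is a minimal standalone sketch of that invariant over a plain int array, not part of the patch; binsrch(), keys and probe are made-up names for the illustration, and the real code works in page offset numbers through _bt_compare().

#include <stdio.h>

/*
 * Illustrative binary search with the same invariant as _bt_binsrch():
 * with nextkey = 0 (false) return the first slot whose key is >= probe,
 * with nextkey = 1 (true) return the first slot whose key is > probe.
 */
static int
binsrch(const int *keys, int nkeys, int probe, int nextkey)
{
    int     low = 0;
    int     high = nkeys;               /* one past the last slot */
    int     cmpval = nextkey ? 0 : 1;   /* same selector as _bt_binsrch() */

    /*
     * Invariant: every slot before 'low' compares before the boundary,
     * every slot at or after 'high' compares at or after it.  The loop
     * falls out when high == low.
     */
    while (high > low)
    {
        int     mid = low + (high - low) / 2;
        /* 3-way result of "probe vs. keys[mid]", like _bt_compare() */
        int     result = (probe > keys[mid]) - (probe < keys[mid]);

        if (result >= cmpval)
            low = mid + 1;
        else
            high = mid;
    }
    return low;                         /* may equal nkeys: "past the end" */
}

int
main(void)
{
    int     keys[] = {10, 20, 20, 30};

    /* first slot >= 20 is 1; first slot > 20 is 3 */
    printf("%d %d\n", binsrch(keys, 4, 20, 0), binsrch(keys, 4, 20, 1));
    return 0;
}

Run as-is this prints "1 3": with nextkey = false the probe lands on the first duplicate of 20, with nextkey = true it lands just past the last one, which is how _bt_first() realizes the >= versus > positioning strategies described in its switch on the strategy number.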
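The _bt_compare() note about "curious-looking coding" refers to flipping the sign of the support function's 3-way result: -INT_MIN overflows in C, so a negative result is replaced by a positive constant rather than negated, which is safe because callers only inspect the sign. A hedged sketch of that idea follows; the exact coding inside _bt_compare() may differ, and flip_comparison() is an illustrative name only.

#include <limits.h>
#include <stdio.h>

/*
 * Flip the sign of a 3-way comparison result without risking signed
 * overflow: negating INT_MIN is undefined behaviour, so negative
 * results are collapsed to 1 (callers only look at the sign), while
 * zero and positive results can be negated safely.
 */
static int
flip_comparison(int result)
{
    return (result < 0) ? 1 : -result;
}

int
main(void)
{
    /* prints "-5 0 1" */
    printf("%d %d %d\n", flip_comparison(5), flip_comparison(0),
           flip_comparison(INT_MIN));
    return 0;
}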