aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/backend/access/nbtree/nbtinsert.c12
-rw-r--r--src/backend/access/nbtree/nbtpage.c4
-rw-r--r--src/backend/access/nbtree/nbtsearch.c289
-rw-r--r--src/include/access/nbtree.h11
4 files changed, 187 insertions, 129 deletions
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index ca3b634b07b..b2300646095 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.109 2003/11/29 19:51:40 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.110 2003/12/21 01:23:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -86,8 +86,8 @@ _bt_doinsert(Relation rel, BTItem btitem,
itup_scankey = _bt_mkscankey(rel, itup);
top:
- /* find the page containing this key */
- stack = _bt_search(rel, natts, itup_scankey, &buf, BT_WRITE);
+ /* find the first page containing this key */
+ stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
/* trade in our read lock for a write lock */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
@@ -100,7 +100,7 @@ top:
* need to move right in the tree. See Lehman and Yao for an
* excruciatingly precise description.
*/
- buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE);
+ buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);
/*
* If we're not allowing duplicates, make sure the key isn't already
@@ -175,7 +175,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
* Find first item >= proposed new item. Note we could also get a
* pointer to end-of-page here.
*/
- offset = _bt_binsrch(rel, buf, natts, itup_scankey);
+ offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
/*
* Scan over all equal tuples, looking for live conflicts.
@@ -478,7 +478,7 @@ _bt_insertonpg(Relation rel,
if (movedright)
newitemoff = P_FIRSTDATAKEY(lpageop);
else
- newitemoff = _bt_binsrch(rel, buf, keysz, scankey);
+ newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
}
/*
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 4004f02f02d..bc0ed6641f7 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.73 2003/11/29 19:51:40 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.74 2003/12/21 01:23:06 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -804,7 +804,7 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
/* we need a scan key to do our search, so build one */
itup_scankey = _bt_mkscankey(rel, &(targetkey->bti_itup));
/* find the leftmost leaf page containing this key */
- stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey,
+ stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
&lbuf, BT_READ);
/* don't need a pin on that either */
_bt_relbuf(rel, lbuf);
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 87d5d0a22a8..554959fc79e 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.83 2003/11/29 19:51:40 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.84 2003/12/21 01:23:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -28,6 +28,10 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
* _bt_search() -- Search the tree for a particular scankey,
* or more precisely for the first leaf page it could be on.
*
+ * When nextkey is false (the usual case), we are looking for the first
+ * item >= scankey. When nextkey is true, we are looking for the first
+ * item strictly greater than scankey.
+ *
* Return value is a stack of parent-page pointers. *bufP is set to the
* address of the leaf-page buffer, which is read-locked and pinned.
* No locks are held on the parent pages, however!
@@ -38,7 +42,7 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
* will result in *bufP being set to InvalidBuffer.
*/
BTStack
-_bt_search(Relation rel, int keysz, ScanKey scankey,
+_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
Buffer *bufP, int access)
{
BTStack stack_in = NULL;
@@ -68,7 +72,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey,
* its pointer in the parent (or metapage). If it has, we may
* need to move right to its new sibling. Do that.
*/
- *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
+ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
/* if this is a leaf page, we're done */
page = BufferGetPage(*bufP);
@@ -80,7 +84,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey,
* Find the appropriate item on the internal page, and get the
* child page that it points to.
*/
- offnum = _bt_binsrch(rel, *bufP, keysz, scankey);
+ offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
itemid = PageGetItemId(page, offnum);
btitem = (BTItem) PageGetItem(page, itemid);
itup = &(btitem->bti_itup);
@@ -117,47 +121,59 @@ _bt_search(Relation rel, int keysz, ScanKey scankey,
/*
* _bt_moveright() -- move right in the btree if necessary.
*
- * When we follow a pointer to reach a page, it is possible that
- * the page has changed in the meanwhile. If this happens, we're
- * guaranteed that the page has "split right" -- that is, that any
- * data that appeared on the page originally is either on the page
- * or strictly to the right of it.
+ * When we follow a pointer to reach a page, it is possible that
+ * the page has changed in the meanwhile. If this happens, we're
+ * guaranteed that the page has "split right" -- that is, that any
+ * data that appeared on the page originally is either on the page
+ * or strictly to the right of it.
+ *
+ * When nextkey is false (the usual case), we are looking for the first
+ * item >= scankey. When nextkey is true, we are looking for the first
+ * item strictly greater than scankey.
*
- * This routine decides whether or not we need to move right in the
- * tree by examining the high key entry on the page. If that entry
- * is strictly less than one we expect to be on the page, then our
- * picture of the page is incorrect and we need to move right.
+ * This routine decides whether or not we need to move right in the
+ * tree by examining the high key entry on the page. If that entry
+ * is strictly less than the scankey, or <= the scankey in the nextkey=true
+ * case, then we followed the wrong link and we need to move right.
*
- * On entry, we have the buffer pinned and a lock of the proper type.
- * If we move right, we release the buffer and lock and acquire the
- * same on the right sibling. Return value is the buffer we stop at.
+ * On entry, we have the buffer pinned and a lock of the type specified by
+ * 'access'. If we move right, we release the buffer and lock and acquire
+ * the same on the right sibling. Return value is the buffer we stop at.
*/
Buffer
_bt_moveright(Relation rel,
Buffer buf,
int keysz,
ScanKey scankey,
+ bool nextkey,
int access)
{
Page page;
BTPageOpaque opaque;
+ int32 cmpval;
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/*
- * If the scan key that brought us to this page is > the high key
- * stored on the page, then the page has split and we need to move
- * right. (If the scan key is equal to the high key, we might or
- * might not need to move right; have to scan the page first anyway.)
- * It could even have split more than once, so scan as far as needed.
+ * When nextkey = false (normal case): if the scan key that brought us to
+ * this page is > the high key stored on the page, then the page has split
+ * and we need to move right. (If the scan key is equal to the high key,
+ * we might or might not need to move right; have to scan the page first
+ * anyway.)
+ *
+ * When nextkey = true: move right if the scan key is >= page's high key.
+ *
+ * The page could even have split more than once, so scan as far as needed.
*
* We also have to move right if we followed a link that brought us to a
* dead page.
*/
+ cmpval = nextkey ? 0 : 1;
+
while (!P_RIGHTMOST(opaque) &&
(P_IGNORE(opaque) ||
- _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0))
+ _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
{
/* step right one page */
BlockNumber rblkno = opaque->btpo_next;
@@ -178,21 +194,26 @@ _bt_moveright(Relation rel,
/*
* _bt_binsrch() -- Do a binary search for a key on a particular page.
*
+ * When nextkey is false (the usual case), we are looking for the first
+ * item >= scankey. When nextkey is true, we are looking for the first
+ * item strictly greater than scankey.
+ *
* The scankey we get has the compare function stored in the procedure
* entry of each data struct. We invoke this regproc to do the
* comparison for every key in the scankey.
*
* On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
- * key >= given scankey. (NOTE: in particular, this means it is possible
- * to return a value 1 greater than the number of keys on the page,
- * if the scankey is > all keys on the page.)
+ * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
+ * particular, this means it is possible to return a value 1 greater than the
+ * number of keys on the page, if the scankey is > all keys on the page.)
*
* On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
- * of the last key < given scankey. (Since _bt_compare treats the first
- * data key of such a page as minus infinity, there will be at least one
- * key < scankey, so the result always points at one of the keys on the
- * page.) This key indicates the right place to descend to be sure we
- * find all leaf keys >= given scankey.
+ * of the last key < given scankey, or last key <= given scankey if nextkey
+ * is true. (Since _bt_compare treats the first data key of such a page as
+ * minus infinity, there will be at least one key < scankey, so the result
+ * always points at one of the keys on the page.) This key indicates the
+ * right place to descend to be sure we find all leaf keys >= given scankey
+ * (or leaf keys > given scankey when nextkey is true).
*
* This procedure is not responsible for walking right, it just examines
* the given page. _bt_binsrch() has no lock or refcount side effects
@@ -202,14 +223,16 @@ OffsetNumber
_bt_binsrch(Relation rel,
Buffer buf,
int keysz,
- ScanKey scankey)
+ ScanKey scankey,
+ bool nextkey)
{
TupleDesc itupdesc;
Page page;
BTPageOpaque opaque;
OffsetNumber low,
high;
- int32 result;
+ int32 result,
+ cmpval;
itupdesc = RelationGetDescr(rel);
page = BufferGetPage(buf);
@@ -229,12 +252,23 @@ _bt_binsrch(Relation rel,
return low;
/*
- * Binary search to find the first key on the page >= scan key. Loop
- * invariant: all slots before 'low' are < scan key, all slots at or
- * after 'high' are >= scan key. We can fall out when high == low.
+ * Binary search to find the first key on the page >= scan key, or
+ * first key > scankey when nextkey is true.
+ *
+ * For nextkey=false (cmpval=1), the loop invariant is: all slots
+ * before 'low' are < scan key, all slots at or after 'high'
+ * are >= scan key.
+ *
+ * For nextkey=true (cmpval=0), the loop invariant is: all slots
+ * before 'low' are <= scan key, all slots at or after 'high'
+ * are > scan key.
+ *
+ * We can fall out when high == low.
*/
high++; /* establish the loop invariant for high */
+ cmpval = nextkey ? 0 : 1; /* select comparison value */
+
while (high > low)
{
OffsetNumber mid = low + ((high - low) / 2);
@@ -243,7 +277,7 @@ _bt_binsrch(Relation rel,
result = _bt_compare(rel, keysz, scankey, page, mid);
- if (result > 0)
+ if (result >= cmpval)
low = mid + 1;
else
high = mid;
@@ -253,15 +287,15 @@ _bt_binsrch(Relation rel,
* At this point we have high == low, but be careful: they could point
* past the last slot on the page.
*
- * On a leaf page, we always return the first key >= scan key (which
- * could be the last slot + 1).
+ * On a leaf page, we always return the first key >= scan key (resp.
+ * > scan key), which could be the last slot + 1.
*/
if (P_ISLEAF(opaque))
return low;
/*
- * On a non-leaf page, return the last key < scan key. There must be
- * one if _bt_compare() is playing by the rules.
+ * On a non-leaf page, return the last key < scan key (resp. <= scan key).
+ * There must be one if _bt_compare() is playing by the rules.
*/
Assert(low > P_FIRSTDATAKEY(opaque));
@@ -462,6 +496,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
StrategyNumber strat;
bool res;
int32 result;
+ bool nextkey;
bool continuescan;
ScanKey scankeys = NULL;
ScanKey *startKeys = NULL;
@@ -663,10 +698,50 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
current = &(scan->currentItemData);
/*
+ * We want to locate either the first item >= boundary point, or
+ * first item > boundary point, depending on the initial-positioning
+ * strategy we just chose.
+ */
+ switch (strat_total)
+ {
+ case BTLessStrategyNumber:
+ nextkey = false;
+ break;
+
+ case BTLessEqualStrategyNumber:
+ nextkey = true;
+ break;
+
+ case BTEqualStrategyNumber:
+ /*
+ * If a backward scan was specified, need to start with last
+ * equal item not first one.
+ */
+ if (ScanDirectionIsBackward(dir))
+ nextkey = true;
+ else
+ nextkey = false;
+ break;
+
+ case BTGreaterEqualStrategyNumber:
+ nextkey = false;
+ break;
+
+ case BTGreaterStrategyNumber:
+ nextkey = true;
+ break;
+
+ default:
+ /* can't get here, but keep compiler quiet */
+ elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
+ return false;
+ }
+
+ /*
* Use the manufactured scan key to descend the tree and position
* ourselves on the target leaf page.
*/
- stack = _bt_search(rel, keysCount, scankeys, &buf, BT_READ);
+ stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);
/* don't need to keep the stack around... */
_bt_freestack(stack);
@@ -686,35 +761,45 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
page = BufferGetPage(buf);
/* position to the precise item on the page */
- offnum = _bt_binsrch(rel, buf, keysCount, scankeys);
+ offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
ItemPointerSet(current, blkno, offnum);
/*
- * At this point we are positioned at the first item >= scan key, or
- * possibly at the end of a page on which all the existing items are
+ * It's now time to examine the initial-positioning strategy to find the
+ * exact place to start the scan.
+ *
+ * If nextkey = false, we are positioned at the first item >= scan key,
+ * or possibly at the end of a page on which all the existing items are
* less than the scan key and we know that everything on later pages
* is greater than or equal to scan key.
*
- * We could step forward in the latter case, but that'd be a waste of
- * time if we want to scan backwards. So, it's now time to examine
- * the initial-positioning strategy to find the exact place to start
- * the scan.
+ * If nextkey = true, we are positioned at the first item > scan key,
+ * or possibly at the end of a page on which all the existing items are
+ * less than or equal to the scan key and we know that everything on
+ * later pages is greater than scan key.
+ *
+ * The actually desired starting point is either this item or an adjacent
+ * one, or in the end-of-page case it's the last item on this page or
+ * the first item on the next. We apply _bt_step if needed to get to
+ * the right place.
*
* Note: if _bt_step fails (meaning we fell off the end of the index in
- * one direction or the other), we either return false (no matches) or
- * call _bt_endpoint() to set up a scan starting at that index
- * endpoint, as appropriate for the desired scan type.
+ * one direction or the other), then there are no matches so we just
+ * return false.
*
* it's yet other place to add some code later for is(not)null ...
*/
-
switch (strat_total)
{
case BTLessStrategyNumber:
/*
- * Back up one to arrive at last item < scankey
+ * We are on first item >= scankey.
+ *
+ * Back up one to arrive at last item < scankey. (Note: this
+ * positioning strategy is only used for a backward scan, so
+ * that is always the correct starting position.)
*/
if (!_bt_step(scan, &buf, BackwardScanDirection))
{
@@ -726,30 +811,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
case BTLessEqualStrategyNumber:
/*
- * We need to find the last item <= scankey, so step forward
- * till we find one > scankey, then step back one.
+ * We are on first item > scankey.
+ *
+ * Back up one to arrive at last item <= scankey. (Note: this
+ * positioning strategy is only used for a backward scan, so
+ * that is always the correct starting position.)
*/
- if (offnum > PageGetMaxOffsetNumber(page))
- {
- if (!_bt_step(scan, &buf, ForwardScanDirection))
- {
- pfree(scankeys);
- return _bt_endpoint(scan, dir);
- }
- }
- for (;;)
- {
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- result = _bt_compare(rel, keysCount, scankeys, page, offnum);
- if (result < 0)
- break;
- if (!_bt_step(scan, &buf, ForwardScanDirection))
- {
- pfree(scankeys);
- return _bt_endpoint(scan, dir);
- }
- }
if (!_bt_step(scan, &buf, BackwardScanDirection))
{
pfree(scankeys);
@@ -758,45 +825,49 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
break;
case BTEqualStrategyNumber:
-
/*
- * Make sure we are on the first equal item; might have to
- * step forward if currently at end of page.
+ * If a backward scan was specified, need to start with last
+ * equal item not first one.
*/
- if (offnum > PageGetMaxOffsetNumber(page))
+ if (ScanDirectionIsBackward(dir))
{
- if (!_bt_step(scan, &buf, ForwardScanDirection))
+ /*
+ * We are on first item > scankey.
+ *
+ * Back up one to arrive at last item <= scankey, then
+ * check to see if it is equal to scankey.
+ */
+ if (!_bt_step(scan, &buf, BackwardScanDirection))
{
pfree(scankeys);
return false;
}
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
}
- result = _bt_compare(rel, keysCount, scankeys, page, offnum);
- if (result != 0)
- goto nomatches; /* no equal items! */
-
- /*
- * If a backward scan was specified, need to start with last
- * equal item not first one.
- */
- if (ScanDirectionIsBackward(dir))
+ else
{
- do
+ /*
+ * We are on first item >= scankey.
+ *
+ * Make sure we are on a real item; might have to
+ * step forward if currently at end of page. Then check
+ * to see if it is equal to scankey.
+ */
+ if (offnum > PageGetMaxOffsetNumber(page))
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
- return _bt_endpoint(scan, dir);
+ return false;
}
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- result = _bt_compare(rel, keysCount, scankeys, page, offnum);
- } while (result == 0);
- if (!_bt_step(scan, &buf, BackwardScanDirection))
- elog(ERROR, "equal items disappeared?");
+ }
}
+
+ /* If we are not now on an equal item, then there ain't any. */
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ result = _bt_compare(rel, keysCount, scankeys, page, offnum);
+ if (result != 0)
+ goto nomatches; /* no equal items! */
break;
case BTGreaterEqualStrategyNumber:
@@ -818,8 +889,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
case BTGreaterStrategyNumber:
/*
- * We want the first item > scankey, so make sure we are on an
- * item and then step over any equal items.
+ * We want the first item > scankey, which is where we are...
+ * unless we're not anywhere at all...
*/
if (offnum > PageGetMaxOffsetNumber(page))
{
@@ -828,20 +899,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
pfree(scankeys);
return false;
}
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- }
- result = _bt_compare(rel, keysCount, scankeys, page, offnum);
- while (result == 0)
- {
- if (!_bt_step(scan, &buf, ForwardScanDirection))
- {
- pfree(scankeys);
- return false;
- }
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- result = _bt_compare(rel, keysCount, scankeys, page, offnum);
}
break;
}
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 6fc61a90999..62af3ca30e9 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.74 2003/11/29 22:40:55 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.75 2003/12/21 01:23:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -459,12 +459,13 @@ extern int _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full);
/*
* prototypes for functions in nbtsearch.c
*/
-extern BTStack _bt_search(Relation rel, int keysz, ScanKey scankey,
- Buffer *bufP, int access);
+extern BTStack _bt_search(Relation rel,
+ int keysz, ScanKey scankey, bool nextkey,
+ Buffer *bufP, int access);
extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
- ScanKey scankey, int access);
+ ScanKey scankey, bool nextkey, int access);
extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
- ScanKey scankey);
+ ScanKey scankey, bool nextkey);
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
Page page, OffsetNumber offnum);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);