aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/nbtree/nbtsearch.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/nbtree/nbtsearch.c')
-rw-r--r--src/backend/access/nbtree/nbtsearch.c106
1 files changed, 97 insertions, 9 deletions
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 5a5c30abc3a..5643dd4d884 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -152,8 +152,12 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
* downlink (block) to uniquely identify the index entry, in case it
* moves right while we're working lower in the tree. See the paper
* by Lehman and Yao for how this is detected and handled. (We use the
- * child link to disambiguate duplicate keys in the index -- Lehman
- * and Yao disallow duplicate keys.)
+ * child link during the second half of a page split -- if caller ends
+ * up splitting the child it usually ends up inserting a new pivot
+ * tuple for child's new right sibling immediately after the original
+ * bts_offset offset recorded here. The downlink block will be needed
+ * to check if bts_offset remains the position of this same pivot
+ * tuple.)
*/
new_stack = (BTStack) palloc(sizeof(BTStackData));
new_stack->bts_blkno = par_blkno;
@@ -251,11 +255,13 @@ _bt_moveright(Relation rel,
/*
* When nextkey = false (normal case): if the scan key that brought us to
* this page is > the high key stored on the page, then the page has split
- * and we need to move right. (If the scan key is equal to the high key,
- * we might or might not need to move right; have to scan the page first
- * anyway.)
+ * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
+ * have some duplicates to the right as well as the left, but that's
+ * something that's only ever dealt with on the leaf level, after
+ * _bt_search has found an initial leaf page.)
*
* When nextkey = true: move right if the scan key is >= page's high key.
+ * (Note that key.scantid cannot be set in this case.)
*
* The page could even have split more than once, so scan as far as
* needed.
@@ -347,6 +353,9 @@ _bt_binsrch(Relation rel,
int32 result,
cmpval;
+ /* Requesting nextkey semantics while using scantid seems nonsensical */
+ Assert(!key->nextkey || key->scantid == NULL);
+
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -554,10 +563,14 @@ _bt_compare(Relation rel,
TupleDesc itupdesc = RelationGetDescr(rel);
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
IndexTuple itup;
+ ItemPointer heapTid;
ScanKey scankey;
+ int ncmpkey;
+ int ntupatts;
- Assert(_bt_check_natts(rel, page, offnum));
+ Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(key->heapkeyspace || key->scantid == NULL);
/*
* Force result ">" if target item is first data item on an internal page
@@ -567,6 +580,7 @@ _bt_compare(Relation rel,
return 1;
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ ntupatts = BTreeTupleGetNAtts(itup, rel);
/*
* The scan key is set up with the attribute number associated with each
@@ -580,8 +594,10 @@ _bt_compare(Relation rel,
* _bt_first).
*/
+ ncmpkey = Min(ntupatts, key->keysz);
+ Assert(key->heapkeyspace || ncmpkey == key->keysz);
scankey = key->scankeys;
- for (int i = 1; i <= key->keysz; i++)
+ for (int i = 1; i <= ncmpkey; i++)
{
Datum datum;
bool isNull;
@@ -632,8 +648,77 @@ _bt_compare(Relation rel,
scankey++;
}
- /* if we get here, the keys are equal */
- return 0;
+ /*
+ * All non-truncated attributes (other than heap TID) were found to be
+ * equal. Treat truncated attributes as minus infinity when scankey has a
+ * key attribute value that would otherwise be compared directly.
+ *
+ * Note: it doesn't matter if ntupatts includes non-key attributes;
+ * scankey won't, so explicitly excluding non-key attributes isn't
+ * necessary.
+ */
+ if (key->keysz > ntupatts)
+ return 1;
+
+ /*
+ * Use the heap TID attribute and scantid to try to break the tie. The
+ * rules are the same as any other key attribute -- only the
+ * representation differs.
+ */
+ heapTid = BTreeTupleGetHeapTID(itup);
+ if (key->scantid == NULL)
+ {
+ /*
+ * Most searches have a scankey that is considered greater than a
+ * truncated pivot tuple if and when the scankey has equal values for
+ * attributes up to and including the least significant untruncated
+ * attribute in tuple.
+ *
+ * For example, if an index has the minimum two attributes (single
+ * user key attribute, plus heap TID attribute), and a page's high key
+ * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
+ * will not descend to the page to the left. The search will descend
+ * right instead. The truncated attribute in pivot tuple means that
+ * all non-pivot tuples on the page to the left are strictly < 'foo',
+ * so it isn't necessary to descend left. In other words, search
+ * doesn't have to descend left because it isn't interested in a match
+ * that has a heap TID value of -inf.
+ *
+ * However, some searches (pivotsearch searches) actually require that
+ * we descend left when this happens. -inf is treated as a possible
+ * match for omitted scankey attribute(s). This is needed by page
+ * deletion, which must re-find leaf pages that are targets for
+ * deletion using their high keys.
+ *
+ * Note: the heap TID part of the test ensures that scankey is being
+ * compared to a pivot tuple with one or more truncated key
+ * attributes.
+ *
+ * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
+ * left here, since they have no heap TID attribute (and cannot have
+ * any -inf key values in any case, since truncation can only remove
+ * non-key attributes). !heapkeyspace searches must always be
+ * prepared to deal with matches on both sides of the pivot once the
+ * leaf level is reached.
+ */
+ if (key->heapkeyspace && !key->pivotsearch &&
+ key->keysz == ntupatts && heapTid == NULL)
+ return 1;
+
+ /* All provided scankey arguments found to be equal */
+ return 0;
+ }
+
+ /*
+ * Treat truncated heap TID as minus infinity, since scankey has a key
+ * attribute value (scantid) that would otherwise be compared directly
+ */
+ Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
+ if (heapTid == NULL)
+ return 1;
+
+ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
+ return ItemPointerCompare(key->scantid, heapTid);
}
/*
@@ -1148,7 +1233,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}
/* Initialize remaining insertion scan key fields */
+ inskey.heapkeyspace = _bt_heapkeyspace(rel);
inskey.nextkey = nextkey;
+ inskey.pivotsearch = false;
+ inskey.scantid = NULL;
inskey.keysz = keysCount;
/*