aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Geoghegan <pg@bowt.ie>2025-05-07 15:20:42 -0400
committerPeter Geoghegan <pg@bowt.ie>2025-05-07 15:20:42 -0400
commit5f4d98d4f3718cf7a0597c37f3ee4a4485de6ef8 (patch)
treeb635f978d9fb77102b8a7b67d249dc3e7c1f46c9
parent7e25c9363a82b6974c1ca2303ae8ded98af3bb24 (diff)
downloadpostgresql-5f4d98d4f3718cf7a0597c37f3ee4a4485de6ef8.tar.gz
postgresql-5f4d98d4f3718cf7a0597c37f3ee4a4485de6ef8.zip
Prevent premature nbtree array advancement.
nbtree array index scans could fail to return matching tuples in rare cases where the missed tuples cover key space that the scan's arrays incorrectly indicate has already been read. These cases involved nearby tuples with NULL values that were evaluated using a skip array key while in pstate.forcenonrequired mode. To fix, prevent forcenonrequired mode from prematurely advancing the scan's array keys beyond key space that the scan has yet to read tuples from: reset the scan's array keys (to the first elements in the current scan direction) before the _bt_checkkeys call for pstate.finaltup. That way _bt_checkkeys starts from a clean slate, which ensures that it will call _bt_advance_array_keys (while passing it sktrig_required=true). This reliably restores the invariant that the scan's arrays always accurately track its progress through the index's key space (at least when the scan is "between pages"). Oversight in commit 8a510275, which optimized nbtree search scan key comparisons. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Mark Dilger <mark.dilger@enterprisedb.com> Discussion: https://postgr.es/m/CAH2-WzmodSE+gpTd1CRGU9ez8ytyyDS+Kns2r9NzgUp1s56kpw@mail.gmail.com
-rw-r--r--src/backend/access/nbtree/nbtsearch.c8
-rw-r--r--src/backend/access/nbtree/nbtutils.c42
2 files changed, 39 insertions, 11 deletions
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 77264ddeecb..fe9a3886913 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1790,9 +1790,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
int truncatt;
- truncatt = BTreeTupleGetNAtts(itup, rel);
+ /* Reset arrays, per _bt_set_startikey contract */
+ if (pstate.forcenonrequired)
+ _bt_start_array_keys(scan, dir);
pstate.forcenonrequired = false;
pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */
+
+ truncatt = BTreeTupleGetNAtts(itup, rel);
_bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
}
@@ -1879,8 +1883,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
pstate.offnum = offnum;
if (arrayKeys && offnum == minoff && pstate.forcenonrequired)
{
+ /* Reset arrays, per _bt_set_startikey contract */
pstate.forcenonrequired = false;
pstate.startikey = 0;
+ _bt_start_array_keys(scan, dir);
}
passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
itup, indnatts);
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index c580c2bf527..1a15dfcb7d3 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -2489,13 +2489,14 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
* primscan's first page would mislead _bt_advance_array_keys, which expects
* pstate.nskipadvances to be representative of every first page's key space.)
*
- * Caller must reset startikey and forcenonrequired ahead of the _bt_checkkeys
- * call for pstate.finaltup iff we set forcenonrequired=true. This will give
- * _bt_checkkeys the opportunity to call _bt_advance_array_keys once more,
- * with sktrig_required=true, to advance the arrays that were ignored during
- * checks of all of the page's prior tuples. Caller doesn't need to do this
- * on the rightmost/leftmost page in the index (where pstate.finaltup isn't
- * set), since forcenonrequired won't be set here by us in the first place.
+ * Caller must call _bt_start_array_keys and reset startikey/forcenonrequired
+ * ahead of the finaltup _bt_checkkeys call when we set forcenonrequired=true.
+ * This will give _bt_checkkeys the opportunity to call _bt_advance_array_keys
+ * with sktrig_required=true, restoring the invariant that the scan's required
+ * arrays always track the scan's progress through the index's key space.
+ * Caller won't need to do this on the rightmost/leftmost page in the index
+ * (where pstate.finaltup isn't ever set), since forcenonrequired will never
+ * be set here in the first place.
*/
void
_bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate)
@@ -2556,10 +2557,31 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate)
if (key->sk_flags & SK_ROW_HEADER)
{
/*
- * Can't let pstate.startikey get set to an ikey beyond a
- * RowCompare inequality
+ * RowCompare inequality.
+ *
+ * Only the first subkey from a RowCompare can ever be marked
+ * required (that happens when the row header is marked required).
+ * There is no simple, general way for us to transitively deduce
+ * whether or not every tuple on the page satisfies a RowCompare
+ * key based only on firsttup and lasttup -- so we just give up.
*/
- break; /* unsafe */
+ if (!start_past_saop_eq && !so->skipScan)
+ break; /* unsafe to go further */
+
+ /*
+ * We have to be even more careful with RowCompares that come
+ * after an array: we assume it's unsafe to even bypass the array.
+ * Calling _bt_start_array_keys to recover the scan's arrays
+ * following use of forcenonrequired mode isn't compatible with
+ * _bt_check_rowcompare's continuescan=false behavior with NULL
+ * row compare members. _bt_advance_array_keys must not make a
+ * decision on the basis of a key not being satisfied in the
+ * opposite-to-scan direction until the scan reaches a leaf page
+ * where the same key begins to be satisfied in scan direction.
+ * The _bt_first !used_all_subkeys behavior makes this limitation
+ * hard to work around some other way.
+ */
+ return; /* completely unsafe to set pstate.startikey */
}
if (key->sk_strategy != BTEqualStrategyNumber)
{