diff options
Diffstat (limited to 'src/backend/access/heap/heapam_handler.c')
-rw-r--r-- | src/backend/access/heap/heapam_handler.c | 749 |
1 files changed, 749 insertions, 0 deletions
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 0e1a1fe7b6f..1e4394a665b 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -19,11 +19,19 @@ */ #include "postgres.h" +#include "miscadmin.h" + +#include "access/genam.h" #include "access/heapam.h" #include "access/tableam.h" #include "access/xact.h" +#include "catalog/catalog.h" +#include "catalog/index.h" +#include "executor/executor.h" #include "storage/bufmgr.h" +#include "storage/bufpage.h" #include "storage/lmgr.h" +#include "storage/procarray.h" #include "utils/builtins.h" @@ -511,6 +519,744 @@ tuple_lock_retry: /* ------------------------------------------------------------------------ + * DDL related callbacks for heap AM. + * ------------------------------------------------------------------------ + */ + +static double +heapam_index_build_range_scan(Relation heapRelation, + Relation indexRelation, + IndexInfo *indexInfo, + bool allow_sync, + bool anyvisible, + BlockNumber start_blockno, + BlockNumber numblocks, + IndexBuildCallback callback, + void *callback_state, + TableScanDesc scan) +{ + HeapScanDesc hscan; + bool is_system_catalog; + bool checking_uniqueness; + HeapTuple heapTuple; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + double reltuples; + ExprState *predicate; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + Snapshot snapshot; + bool need_unregister_snapshot = false; + TransactionId OldestXmin; + BlockNumber root_blkno = InvalidBlockNumber; + OffsetNumber root_offsets[MaxHeapTuplesPerPage]; + + /* + * sanity checks + */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + /* Remember if it's a system catalog */ + is_system_catalog = IsSystemRelation(heapRelation); + + /* See whether we're verifying uniqueness/exclusion properties */ + checking_uniqueness = (indexInfo->ii_Unique || + indexInfo->ii_ExclusionOps != NULL); + + /* + * "Any visible" mode is not compatible with uniqueness checks; make sure + * only one of those is requested. + */ + Assert(!(anyvisible && checking_uniqueness)); + + /* + * Need an EState for evaluation of index expressions and partial-index + * predicates. Also a slot to hold the current tuple. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + slot = table_slot_create(heapRelation, NULL); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + + /* + * Prepare for scan of the base relation. In a normal index build, we use + * SnapshotAny because we must retrieve all tuples and do our own time + * qual checks (because we have to index RECENTLY_DEAD tuples). In a + * concurrent build, or during bootstrap, we take a regular MVCC snapshot + * and index whatever's live according to that. + */ + OldestXmin = InvalidTransactionId; + + /* okay to ignore lazy VACUUMs here */ + if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) + OldestXmin = GetOldestXmin(heapRelation, PROCARRAY_FLAGS_VACUUM); + + if (!scan) + { + /* + * Serial index build. + * + * Must begin our own heap scan in this case. We may also need to + * register a snapshot whose lifetime is under our direct control. + */ + if (!TransactionIdIsValid(OldestXmin)) + { + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + need_unregister_snapshot = true; + } + else + snapshot = SnapshotAny; + + scan = table_beginscan_strat(heapRelation, /* relation */ + snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + true, /* buffer access strategy OK */ + allow_sync); /* syncscan OK? */ + } + else + { + /* + * Parallel index build. + * + * Parallel case never registers/unregisters own snapshot. Snapshot + * is taken from parallel heap scan, and is SnapshotAny or an MVCC + * snapshot, based on same criteria as serial case. + */ + Assert(!IsBootstrapProcessingMode()); + Assert(allow_sync); + snapshot = scan->rs_snapshot; + } + + hscan = (HeapScanDesc) scan; + + /* + * Must call GetOldestXmin() with SnapshotAny. Should never call + * GetOldestXmin() with MVCC snapshot. (It's especially worth checking + * this for parallel builds, since ambuild routines that support parallel + * builds must work these details out for themselves.) + */ + Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot)); + Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : + !TransactionIdIsValid(OldestXmin)); + Assert(snapshot == SnapshotAny || !anyvisible); + + /* set our scan endpoints */ + if (!allow_sync) + heap_setscanlimits(scan, start_blockno, numblocks); + else + { + /* syncscan can only be requested on whole relation */ + Assert(start_blockno == 0); + Assert(numblocks == InvalidBlockNumber); + } + + reltuples = 0; + + /* + * Scan all tuples in the base relation. + */ + while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + bool tupleIsAlive; + + CHECK_FOR_INTERRUPTS(); + + /* + * When dealing with a HOT-chain of updated tuples, we want to index + * the values of the live tuple (if any), but index it under the TID + * of the chain's root tuple. This approach is necessary to preserve + * the HOT-chain structure in the heap. So we need to be able to find + * the root item offset for every tuple that's in a HOT-chain. When + * first reaching a new page of the relation, call + * heap_get_root_tuples() to build a map of root item offsets on the + * page. + * + * It might look unsafe to use this information across buffer + * lock/unlock. However, we hold ShareLock on the table so no + * ordinary insert/update/delete should occur; and we hold pin on the + * buffer continuously while visiting the page, so no pruning + * operation can occur either. + * + * Also, although our opinions about tuple liveness could change while + * we scan the page (due to concurrent transaction commits/aborts), + * the chain root locations won't, so this info doesn't need to be + * rebuilt after waiting for another transaction. + * + * Note the implied assumption that there is no more than one live + * tuple per HOT-chain --- else we could create more than one index + * entry pointing to the same root tuple. + */ + if (hscan->rs_cblock != root_blkno) + { + Page page = BufferGetPage(hscan->rs_cbuf); + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + heap_get_root_tuples(page, root_offsets); + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + root_blkno = hscan->rs_cblock; + } + + if (snapshot == SnapshotAny) + { + /* do our own time qual check */ + bool indexIt; + TransactionId xwait; + + recheck: + + /* + * We could possibly get away with not locking the buffer here, + * since caller should hold ShareLock on the relation, but let's + * be conservative about it. (This remark is still correct even + * with HOT-pruning: our pin on the buffer prevents pruning.) + */ + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + + /* + * The criteria for counting a tuple as live in this block need to + * match what analyze.c's acquire_sample_rows() does, otherwise + * CREATE INDEX and ANALYZE may produce wildly different reltuples + * values, e.g. when there are many recently-dead tuples. + */ + switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin, + hscan->rs_cbuf)) + { + case HEAPTUPLE_DEAD: + /* Definitely dead, we can ignore it */ + indexIt = false; + tupleIsAlive = false; + break; + case HEAPTUPLE_LIVE: + /* Normal case, index and unique-check it */ + indexIt = true; + tupleIsAlive = true; + /* Count it as live, too */ + reltuples += 1; + break; + case HEAPTUPLE_RECENTLY_DEAD: + + /* + * If tuple is recently deleted then we must index it + * anyway to preserve MVCC semantics. (Pre-existing + * transactions could try to use the index after we finish + * building it, and may need to see such tuples.) + * + * However, if it was HOT-updated then we must only index + * the live tuple at the end of the HOT-chain. Since this + * breaks semantics for pre-existing snapshots, mark the + * index as unusable for them. + * + * We don't count recently-dead tuples in reltuples, even + * if we index them; see acquire_sample_rows(). + */ + if (HeapTupleIsHotUpdated(heapTuple)) + { + indexIt = false; + /* mark the index as unsafe for old snapshots */ + indexInfo->ii_BrokenHotChain = true; + } + else + indexIt = true; + /* In any case, exclude the tuple from unique-checking */ + tupleIsAlive = false; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * In "anyvisible" mode, this tuple is visible and we + * don't need any further checks. + */ + if (anyvisible) + { + indexIt = true; + tupleIsAlive = true; + reltuples += 1; + break; + } + + /* + * Since caller should hold ShareLock or better, normally + * the only way to see this is if it was inserted earlier + * in our own transaction. However, it can happen in + * system catalogs, since we tend to release write lock + * before commit there. Give a warning if neither case + * applies. + */ + xwait = HeapTupleHeaderGetXmin(heapTuple->t_data); + if (!TransactionIdIsCurrentTransactionId(xwait)) + { + if (!is_system_catalog) + elog(WARNING, "concurrent insert in progress within table \"%s\"", + RelationGetRelationName(heapRelation)); + + /* + * If we are performing uniqueness checks, indexing + * such a tuple could lead to a bogus uniqueness + * failure. In that case we wait for the inserting + * transaction to finish and check again. + */ + if (checking_uniqueness) + { + /* + * Must drop the lock on the buffer before we wait + */ + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(xwait, heapRelation, + &heapTuple->t_self, + XLTW_InsertIndexUnique); + CHECK_FOR_INTERRUPTS(); + goto recheck; + } + } + else + { + /* + * For consistency with acquire_sample_rows(), count + * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only + * when inserted by our own transaction. + */ + reltuples += 1; + } + + /* + * We must index such tuples, since if the index build + * commits then they're good. + */ + indexIt = true; + tupleIsAlive = true; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + + /* + * As with INSERT_IN_PROGRESS case, this is unexpected + * unless it's our own deletion or a system catalog; but + * in anyvisible mode, this tuple is visible. + */ + if (anyvisible) + { + indexIt = true; + tupleIsAlive = false; + reltuples += 1; + break; + } + + xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); + if (!TransactionIdIsCurrentTransactionId(xwait)) + { + if (!is_system_catalog) + elog(WARNING, "concurrent delete in progress within table \"%s\"", + RelationGetRelationName(heapRelation)); + + /* + * If we are performing uniqueness checks, assuming + * the tuple is dead could lead to missing a + * uniqueness violation. In that case we wait for the + * deleting transaction to finish and check again. + * + * Also, if it's a HOT-updated tuple, we should not + * index it but rather the live tuple at the end of + * the HOT-chain. However, the deleting transaction + * could abort, possibly leaving this tuple as live + * after all, in which case it has to be indexed. The + * only way to know what to do is to wait for the + * deleting transaction to finish and check again. + */ + if (checking_uniqueness || + HeapTupleIsHotUpdated(heapTuple)) + { + /* + * Must drop the lock on the buffer before we wait + */ + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(xwait, heapRelation, + &heapTuple->t_self, + XLTW_InsertIndexUnique); + CHECK_FOR_INTERRUPTS(); + goto recheck; + } + + /* + * Otherwise index it but don't check for uniqueness, + * the same as a RECENTLY_DEAD tuple. + */ + indexIt = true; + + /* + * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live, + * if they were not deleted by the current + * transaction. That's what acquire_sample_rows() + * does, and we want the behavior to be consistent. + */ + reltuples += 1; + } + else if (HeapTupleIsHotUpdated(heapTuple)) + { + /* + * It's a HOT-updated tuple deleted by our own xact. + * We can assume the deletion will commit (else the + * index contents don't matter), so treat the same as + * RECENTLY_DEAD HOT-updated tuples. + */ + indexIt = false; + /* mark the index as unsafe for old snapshots */ + indexInfo->ii_BrokenHotChain = true; + } + else + { + /* + * It's a regular tuple deleted by our own xact. Index + * it, but don't check for uniqueness nor count in + * reltuples, the same as a RECENTLY_DEAD tuple. + */ + indexIt = true; + } + /* In any case, exclude the tuple from unique-checking */ + tupleIsAlive = false; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + indexIt = tupleIsAlive = false; /* keep compiler quiet */ + break; + } + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + if (!indexIt) + continue; + } + else + { + /* heap_getnext did the time qual check */ + tupleIsAlive = true; + reltuples += 1; + } + + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + /* Set up for predicate or expression evaluation */ + ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf); + + /* + * In a partial index, discard tuples that don't satisfy the + * predicate. + */ + if (predicate != NULL) + { + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * For the current heap tuple, extract all the attributes we use in + * this index, and note which are null. This also performs evaluation + * of any expressions needed. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* + * You'd think we should go ahead and build the index tuple here, but + * some index AMs want to do further processing on the data first. So + * pass the values[] and isnull[] arrays, instead. + */ + + if (HeapTupleIsHeapOnly(heapTuple)) + { + /* + * For a heap-only tuple, pretend its TID is that of the root. See + * src/backend/access/heap/README.HOT for discussion. + */ + HeapTupleData rootTuple; + OffsetNumber offnum; + + rootTuple = *heapTuple; + offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self); + + if (!OffsetNumberIsValid(root_offsets[offnum - 1])) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"", + ItemPointerGetBlockNumber(&heapTuple->t_self), + offnum, + RelationGetRelationName(heapRelation)))); + + ItemPointerSetOffsetNumber(&rootTuple.t_self, + root_offsets[offnum - 1]); + + /* Call the AM's callback routine to process the tuple */ + callback(indexRelation, &rootTuple, values, isnull, tupleIsAlive, + callback_state); + } + else + { + /* Call the AM's callback routine to process the tuple */ + callback(indexRelation, heapTuple, values, isnull, tupleIsAlive, + callback_state); + } + } + + table_endscan(scan); + + /* we can now forget our snapshot, if set and registered by us */ + if (need_unregister_snapshot) + UnregisterSnapshot(snapshot); + + ExecDropSingleTupleTableSlot(slot); + + FreeExecutorState(estate); + + /* These may have been pointing to the now-gone estate */ + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NULL; + + return reltuples; +} + +static void +heapam_index_validate_scan(Relation heapRelation, + Relation indexRelation, + IndexInfo *indexInfo, + Snapshot snapshot, + ValidateIndexState * state) +{ + TableScanDesc scan; + HeapScanDesc hscan; + HeapTuple heapTuple; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + ExprState *predicate; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + BlockNumber root_blkno = InvalidBlockNumber; + OffsetNumber root_offsets[MaxHeapTuplesPerPage]; + bool in_index[MaxHeapTuplesPerPage]; + + /* state variables for the merge */ + ItemPointer indexcursor = NULL; + ItemPointerData decoded; + bool tuplesort_empty = false; + + /* + * sanity checks + */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + /* + * Need an EState for evaluation of index expressions and partial-index + * predicates. Also a slot to hold the current tuple. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation), + &TTSOpsHeapTuple); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + + /* + * Prepare for scan of the base relation. We need just those tuples + * satisfying the passed-in reference snapshot. We must disable syncscan + * here, because it's critical that we read from block zero forward to + * match the sorted TIDs. + */ + scan = table_beginscan_strat(heapRelation, /* relation */ + snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + true, /* buffer access strategy OK */ + false); /* syncscan not OK */ + hscan = (HeapScanDesc) scan; + + /* + * Scan all tuples matching the snapshot. + */ + while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + ItemPointer heapcursor = &heapTuple->t_self; + ItemPointerData rootTuple; + OffsetNumber root_offnum; + + CHECK_FOR_INTERRUPTS(); + + state->htups += 1; + + /* + * As commented in table_index_build_scan, we should index heap-only + * tuples under the TIDs of their root tuples; so when we advance onto + * a new heap page, build a map of root item offsets on the page. + * + * This complicates merging against the tuplesort output: we will + * visit the live tuples in order by their offsets, but the root + * offsets that we need to compare against the index contents might be + * ordered differently. So we might have to "look back" within the + * tuplesort output, but only within the current page. We handle that + * by keeping a bool array in_index[] showing all the + * already-passed-over tuplesort output TIDs of the current page. We + * clear that array here, when advancing onto a new heap page. + */ + if (hscan->rs_cblock != root_blkno) + { + Page page = BufferGetPage(hscan->rs_cbuf); + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + heap_get_root_tuples(page, root_offsets); + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + memset(in_index, 0, sizeof(in_index)); + + root_blkno = hscan->rs_cblock; + } + + /* Convert actual tuple TID to root TID */ + rootTuple = *heapcursor; + root_offnum = ItemPointerGetOffsetNumber(heapcursor); + + if (HeapTupleIsHeapOnly(heapTuple)) + { + root_offnum = root_offsets[root_offnum - 1]; + if (!OffsetNumberIsValid(root_offnum)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"", + ItemPointerGetBlockNumber(heapcursor), + ItemPointerGetOffsetNumber(heapcursor), + RelationGetRelationName(heapRelation)))); + ItemPointerSetOffsetNumber(&rootTuple, root_offnum); + } + + /* + * "merge" by skipping through the index tuples until we find or pass + * the current root tuple. + */ + while (!tuplesort_empty && + (!indexcursor || + ItemPointerCompare(indexcursor, &rootTuple) < 0)) + { + Datum ts_val; + bool ts_isnull; + + if (indexcursor) + { + /* + * Remember index items seen earlier on the current heap page + */ + if (ItemPointerGetBlockNumber(indexcursor) == root_blkno) + in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true; + } + + tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true, + &ts_val, &ts_isnull, NULL); + Assert(tuplesort_empty || !ts_isnull); + if (!tuplesort_empty) + { + itemptr_decode(&decoded, DatumGetInt64(ts_val)); + indexcursor = &decoded; + + /* If int8 is pass-by-ref, free (encoded) TID Datum memory */ +#ifndef USE_FLOAT8_BYVAL + pfree(DatumGetPointer(ts_val)); +#endif + } + else + { + /* Be tidy */ + indexcursor = NULL; + } + } + + /* + * If the tuplesort has overshot *and* we didn't see a match earlier, + * then this tuple is missing from the index, so insert it. + */ + if ((tuplesort_empty || + ItemPointerCompare(indexcursor, &rootTuple) > 0) && + !in_index[root_offnum - 1]) + { + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + /* Set up for predicate or expression evaluation */ + ExecStoreHeapTuple(heapTuple, slot, false); + + /* + * In a partial index, discard tuples that don't satisfy the + * predicate. + */ + if (predicate != NULL) + { + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * For the current heap tuple, extract all the attributes we use + * in this index, and note which are null. This also performs + * evaluation of any expressions needed. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* + * You'd think we should go ahead and build the index tuple here, + * but some index AMs want to do further processing on the data + * first. So pass the values[] and isnull[] arrays, instead. + */ + + /* + * If the tuple is already committed dead, you might think we + * could suppress uniqueness checking, but this is no longer true + * in the presence of HOT, because the insert is actually a proxy + * for a uniqueness check on the whole HOT-chain. That is, the + * tuple we have here could be dead because it was already + * HOT-updated, and if so the updating transaction will not have + * thought it should insert index entries. The index AM will + * check the whole HOT-chain and correctly detect a conflict if + * there is one. + */ + + index_insert(indexRelation, + values, + isnull, + &rootTuple, + heapRelation, + indexInfo->ii_Unique ? + UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + indexInfo); + + state->tups_inserted += 1; + } + } + + table_endscan(scan); + + ExecDropSingleTupleTableSlot(slot); + + FreeExecutorState(estate); + + /* These may have been pointing to the now-gone estate */ + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NULL; +} + + +/* ------------------------------------------------------------------------ * Definition of the heap table access method. * ------------------------------------------------------------------------ */ @@ -545,6 +1291,9 @@ static const TableAmRoutine heapam_methods = { .tuple_get_latest_tid = heap_get_latest_tid, .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot, .compute_xid_horizon_for_tuples = heap_compute_xid_horizon_for_tuples, + + .index_build_range_scan = heapam_index_build_range_scan, + .index_validate_scan = heapam_index_validate_scan, }; |