aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>2011-03-01 19:05:16 +0200
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>2011-03-01 19:05:16 +0200
commit47ad79122bc099c1f0ea8a7ae413fcd8d45e26a6 (patch)
treeb8a4ac5fa3cff5235a5693ca31825ecbcfdd97db
parent16143d64513e4dc3c72bad7ae98d3df0b5a23013 (diff)
downloadpostgresql-47ad79122bc099c1f0ea8a7ae413fcd8d45e26a6.tar.gz
postgresql-47ad79122bc099c1f0ea8a7ae413fcd8d45e26a6.zip
Fix bugs in Serializable Snapshot Isolation.
Change the way UPDATEs are handled. Instead of maintaining a chain of tuple-level locks in shared memory, copy any existing locks on the old tuple to the new tuple at UPDATE. Any existing page-level lock needs to be duplicated too, as a lock on the new tuple. That was neglected previously. Store xmin on tuple-level predicate locks, to distinguish a lock on an old already-recycled tuple from a new tuple at the same physical location. Failure to distinguish them caused loops in the tuple-lock chains, as reported by YAMAMOTO Takashi. Although we don't use the chain representation of UPDATEs anymore, it seems like a good idea to store the xmin to avoid some false positives if no other reason. CheckSingleTargetForConflictsIn now correctly handles the case where a lock that's being held is not reflected in the local lock table. That happens if another backend acquires a lock on our behalf due to an UPDATE or a page split. PredicateLockPageCombine now retains locks for the page that is being removed, rather than removing them. This prevents a potentially dangerous false-positive inconsistency where the local lock table believes that a lock is held, but it is actually not. Dan Ports and Kevin Grittner
-rw-r--r--src/backend/access/nbtree/nbtree.c1
-rw-r--r--src/backend/storage/lmgr/predicate.c383
-rw-r--r--src/include/storage/lwlock.h1
-rw-r--r--src/include/storage/predicate_internals.h54
4 files changed, 148 insertions, 291 deletions
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index ba01874469b..7a0e1a9c25e 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -824,7 +824,6 @@ restart:
if (_bt_page_recyclable(page))
{
/* Okay to recycle this page */
- Assert(!PageIsPredicateLocked(rel, blkno));
RecordFreeIndexPage(rel, blkno);
vstate->totFreePages++;
stats->pages_deleted++;
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index d660ce5af78..aa657fab579 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -124,10 +124,6 @@
* SerializableXactHashLock
* - Protects both PredXact and SerializableXidHash.
*
- * PredicateLockNextRowLinkLock
- * - Protects the priorVersionOfRow and nextVersionOfRow fields of
- * PREDICATELOCKTARGET when linkage is being created or destroyed.
- *
*
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
@@ -444,8 +440,6 @@ static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
bool summarize);
static bool XidIsConcurrent(TransactionId xid);
static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag);
-static bool CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
- PREDICATELOCKTARGETTAG *nexttargettag);
static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
SERIALIZABLEXACT *writer);
@@ -1044,7 +1038,6 @@ InitPredicateLocks(void)
PredXact->LastSxactCommitSeqNo = FirstNormalSerCommitSeqNo - 1;
PredXact->CanPartialClearThrough = 0;
PredXact->HavePartialClearedThrough = 0;
- PredXact->NeedTargetLinkCleanup = false;
requestSize = mul_size((Size) max_table_size,
PredXactListElementDataSize);
PredXact->element = ShmemAlloc(requestSize);
@@ -1651,9 +1644,10 @@ PageIsPredicateLocked(const Relation relation, const BlockNumber blkno)
* Important note: this function may return false even if the lock is
* being held, because it uses the local lock table which is not
* updated if another transaction modifies our lock list (e.g. to
- * split an index page). However, it will never return true if the
- * lock is not held. We only use this function in circumstances where
- * such false negatives are acceptable.
+ * split an index page). It can also return true when a coarser
+ * granularity lock that covers this target is being held. Be careful
+ * to only use this function in circumstances where such errors are
+ * acceptable!
*/
static bool
PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag)
@@ -1717,6 +1711,9 @@ GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
/*
* Check whether the lock we are considering is already covered by a
* coarser lock for our transaction.
+ *
+ * Like PredicateLockExists, this function might return a false
+ * negative, but it will never return a false positive.
*/
static bool
CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag)
@@ -1747,7 +1744,6 @@ static void
RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
{
PREDICATELOCKTARGET *rmtarget;
- PREDICATELOCKTARGET *next;
Assert(LWLockHeldByMe(SerializablePredicateLockListLock));
@@ -1755,33 +1751,6 @@ RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
if (!SHMQueueEmpty(&target->predicateLocks))
return;
- /* Can't remove it if there are locks for a prior row version. */
- LWLockAcquire(PredicateLockNextRowLinkLock, LW_EXCLUSIVE);
- if (target->priorVersionOfRow != NULL)
- {
- LWLockRelease(PredicateLockNextRowLinkLock);
- return;
- }
-
- /*
- * We are going to release this target, This requires that we let the
- * next version of the row (if any) know that it's previous version is
- * done.
- *
- * It might be that the link was all that was keeping the other target
- * from cleanup, but we can't clean that up here -- LW locking is all
- * wrong for that. We'll pass the HTAB in the general cleanup function to
- * get rid of such "dead" targets.
- */
- next = target->nextVersionOfRow;
- if (next != NULL)
- {
- next->priorVersionOfRow = NULL;
- if (SHMQueueEmpty(&next->predicateLocks))
- PredXact->NeedTargetLinkCleanup = true;
- }
- LWLockRelease(PredicateLockNextRowLinkLock);
-
/* Actually remove the target. */
rmtarget = hash_search_with_hash_value(PredicateLockTargetHash,
&target->tag,
@@ -2065,11 +2034,7 @@ CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
errmsg("out of shared memory"),
errhint("You might need to increase max_pred_locks_per_transaction.")));
if (!found)
- {
SHMQueueInit(&(target->predicateLocks));
- target->priorVersionOfRow = NULL;
- target->nextVersionOfRow = NULL;
- }
/* We've got the sxact and target, make sure they're joined. */
locktag.myTarget = target;
@@ -2125,8 +2090,6 @@ PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag)
hash_search_with_hash_value(LocalPredicateLockHash,
targettag, targettaghash,
HASH_ENTER, &found);
- /* We should not hold the lock (but its entry might still exist) */
- Assert(!found || !locallock->held);
locallock->held = true;
if (!found)
locallock->childLocks = 0;
@@ -2215,6 +2178,7 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
{
PREDICATELOCKTARGETTAG tag;
ItemPointer tid;
+ TransactionId targetxmin;
if (SkipSerialization(relation))
return;
@@ -2224,15 +2188,16 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
*/
if (relation->rd_index == NULL)
{
- TransactionId myxid = GetTopTransactionIdIfAny();
+ TransactionId myxid;
+
+ targetxmin = HeapTupleHeaderGetXmin(tuple->t_data);
+ myxid = GetTopTransactionIdIfAny();
if (TransactionIdIsValid(myxid))
{
- TransactionId xid = HeapTupleHeaderGetXmin(tuple->t_data);
-
- if (TransactionIdFollowsOrEquals(xid, TransactionXmin))
+ if (TransactionIdFollowsOrEquals(targetxmin, TransactionXmin))
{
- xid = SubTransGetTopmostTransaction(xid);
+ TransactionId xid = SubTransGetTopmostTransaction(targetxmin);
if (TransactionIdEquals(xid, myxid))
{
/* We wrote it; we already have a write lock. */
@@ -2241,6 +2206,8 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
}
}
}
+ else
+ targetxmin = InvalidTransactionId;
/*
* Do quick-but-not-definitive test for a relation lock first. This will
@@ -2259,116 +2226,78 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
relation->rd_node.dbNode,
relation->rd_id,
ItemPointerGetBlockNumber(tid),
- ItemPointerGetOffsetNumber(tid));
+ ItemPointerGetOffsetNumber(tid),
+ targetxmin);
PredicateLockAcquire(&tag);
}
/*
- * If the old tuple has any predicate locks, create a lock target for the
- * new tuple and point them at each other. Conflict detection needs to
- * look for locks against prior versions of the row.
+ * If the old tuple has any predicate locks, copy them to the new target.
+ *
+ * This is called at an UPDATE, where any predicate locks held on the old
+ * tuple need to be copied to the new tuple, because logically they both
+ * represent the same row. A lock taken before the update must conflict
+ * with anyone locking the same row after the update.
*/
void
PredicateLockTupleRowVersionLink(const Relation relation,
const HeapTuple oldTuple,
const HeapTuple newTuple)
{
- PREDICATELOCKTARGETTAG oldtargettag;
- PREDICATELOCKTARGETTAG newtargettag;
- PREDICATELOCKTARGET *oldtarget;
- PREDICATELOCKTARGET *newtarget;
- PREDICATELOCKTARGET *next;
- uint32 oldtargettaghash;
- LWLockId oldpartitionLock;
- uint32 newtargettaghash;
- LWLockId newpartitionLock;
- bool found;
-
- SET_PREDICATELOCKTARGETTAG_TUPLE(oldtargettag,
+ PREDICATELOCKTARGETTAG oldtupletag;
+ PREDICATELOCKTARGETTAG oldpagetag;
+ PREDICATELOCKTARGETTAG newtupletag;
+ BlockNumber oldblk,
+ newblk;
+ OffsetNumber oldoff,
+ newoff;
+ TransactionId oldxmin,
+ newxmin;
+
+ oldblk = ItemPointerGetBlockNumber(&(oldTuple->t_self));
+ oldoff = ItemPointerGetOffsetNumber(&(oldTuple->t_self));
+ oldxmin = HeapTupleHeaderGetXmin(oldTuple->t_data);
+
+ newblk = ItemPointerGetBlockNumber(&(newTuple->t_self));
+ newoff = ItemPointerGetOffsetNumber(&(newTuple->t_self));
+ newxmin = HeapTupleHeaderGetXmin(newTuple->t_data);
+
+ SET_PREDICATELOCKTARGETTAG_TUPLE(oldtupletag,
relation->rd_node.dbNode,
relation->rd_id,
- ItemPointerGetBlockNumber(&(oldTuple->t_self)),
- ItemPointerGetOffsetNumber(&(oldTuple->t_self)));
- oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
- oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+ oldblk,
+ oldoff,
+ oldxmin);
- SET_PREDICATELOCKTARGETTAG_TUPLE(newtargettag,
+ SET_PREDICATELOCKTARGETTAG_PAGE(oldpagetag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ oldblk);
+
+ SET_PREDICATELOCKTARGETTAG_TUPLE(newtupletag,
relation->rd_node.dbNode,
relation->rd_id,
- ItemPointerGetBlockNumber(&(newTuple->t_self)),
- ItemPointerGetOffsetNumber(&(newTuple->t_self)));
- newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
- newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
-
- /* Lock lower numbered partition first. */
- if (oldpartitionLock < newpartitionLock)
- {
- LWLockAcquire(oldpartitionLock, LW_SHARED);
- LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
- }
- else if (newpartitionLock < oldpartitionLock)
- {
- LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
- LWLockAcquire(oldpartitionLock, LW_SHARED);
- }
- else
- LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
-
- oldtarget = (PREDICATELOCKTARGET *)
- hash_search_with_hash_value(PredicateLockTargetHash,
- &oldtargettag, oldtargettaghash,
- HASH_FIND, NULL);
-
- /* Only need to link if there is an old target already. */
- if (oldtarget)
- {
- LWLockAcquire(PredicateLockNextRowLinkLock, LW_EXCLUSIVE);
-
- /* Guard against stale pointers from rollback. */
- next = oldtarget->nextVersionOfRow;
- if (next != NULL)
- {
- next->priorVersionOfRow = NULL;
- oldtarget->nextVersionOfRow = NULL;
- }
-
- /* Find or create the new target, and link old and new. */
- newtarget = (PREDICATELOCKTARGET *)
- hash_search_with_hash_value(PredicateLockTargetHash,
- &newtargettag, newtargettaghash,
- HASH_ENTER, &found);
- if (!newtarget)
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of shared memory"),
- errhint("You might need to increase max_pred_locks_per_transaction.")));
- if (!found)
- {
- SHMQueueInit(&(newtarget->predicateLocks));
- newtarget->nextVersionOfRow = NULL;
- }
- else
- Assert(newtarget->priorVersionOfRow == NULL);
+ newblk,
+ newoff,
+ newxmin);
- newtarget->priorVersionOfRow = oldtarget;
- oldtarget->nextVersionOfRow = newtarget;
+ /*
+ * A page-level lock on the page containing the old tuple counts too.
+ * Anyone holding a lock on the page is logically holding a lock on
+ * the old tuple, so we need to acquire a lock on his behalf on the
+ * new tuple too. However, if the new tuple is on the same page as the
+ * old one, the old page-level lock already covers the new tuple.
+ *
+ * A relation-level lock always covers both tuple versions, so we don't
+ * need to worry about those here.
+ */
+ LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
- LWLockRelease(PredicateLockNextRowLinkLock);
- }
+ TransferPredicateLocksToNewTarget(oldtupletag, newtupletag, false);
+ if (newblk != oldblk)
+ TransferPredicateLocksToNewTarget(oldpagetag, newtupletag, false);
- /* Release lower number partition last. */
- if (oldpartitionLock < newpartitionLock)
- {
- LWLockRelease(newpartitionLock);
- LWLockRelease(oldpartitionLock);
- }
- else if (newpartitionLock < oldpartitionLock)
- {
- LWLockRelease(oldpartitionLock);
- LWLockRelease(newpartitionLock);
- }
- else
- LWLockRelease(newpartitionLock);
+ LWLockRelease(SerializablePredicateLockListLock);
}
@@ -2437,6 +2366,17 @@ DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash)
* removeOld is set (by using the reserved entry in
* PredicateLockTargetHash for scratch space).
*
+ * Warning: the "removeOld" option should be used only with care,
+ * because this function does not (indeed, can not) update other
+ * backends' LocalPredicateLockHash. If we are only adding new
+ * entries, this is not a problem: the local lock table is used only
+ * as a hint, so missing entries for locks that are held are
+ * OK. Having entries for locks that are no longer held, as can happen
+ * when using "removeOld", is not in general OK. We can only use it
+ * safely when replacing a lock with a coarser-granularity lock that
+ * covers it, or if we are absolutely certain that no one will need to
+ * refer to that lock in the future.
+ *
* Caller must hold SerializablePredicateLockListLock.
*/
static bool
@@ -2533,11 +2473,7 @@ TransferPredicateLocksToNewTarget(const PREDICATELOCKTARGETTAG oldtargettag,
/* If we created a new entry, initialize it */
if (!found)
- {
SHMQueueInit(&(newtarget->predicateLocks));
- newtarget->priorVersionOfRow = NULL;
- newtarget->nextVersionOfRow = NULL;
- }
newpredlocktag.myTarget = newtarget;
@@ -2704,7 +2640,14 @@ PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno,
&newtargettag);
Assert(success);
- /* Move the locks to the parent. This shouldn't fail. */
+ /*
+ * Move the locks to the parent. This shouldn't fail.
+ *
+ * Note that here we are removing locks held by other
+ * backends, leading to a possible inconsistency in their
+ * local lock hash table. This is OK because we're replacing
+ * it with a lock that covers the old one.
+ */
success = TransferPredicateLocksToNewTarget(oldtargettag,
newtargettag,
true);
@@ -2727,36 +2670,19 @@ void
PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno,
const BlockNumber newblkno)
{
- PREDICATELOCKTARGETTAG oldtargettag;
- PREDICATELOCKTARGETTAG newtargettag;
- bool success;
-
-
- if (SkipSplitTracking(relation))
- return;
-
- Assert(oldblkno != newblkno);
- Assert(BlockNumberIsValid(oldblkno));
- Assert(BlockNumberIsValid(newblkno));
-
- SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
- relation->rd_node.dbNode,
- relation->rd_id,
- oldblkno);
- SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
- relation->rd_node.dbNode,
- relation->rd_id,
- newblkno);
-
- LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
-
- /* Move the locks. This shouldn't fail. */
- success = TransferPredicateLocksToNewTarget(oldtargettag,
- newtargettag,
- true);
- Assert(success);
-
- LWLockRelease(SerializablePredicateLockListLock);
+ /*
+ * Page combines differ from page splits in that we ought to be
+ * able to remove the locks on the old page after transferring
+ * them to the new page, instead of duplicating them. However,
+ * because we can't edit other backends' local lock tables,
+ * removing the old lock would leave them with an entry in their
+ * LocalPredicateLockHash for a lock they're not holding, which
+ * isn't acceptable. So we wind up having to do the same work as a
+ * page split, acquiring a lock on the new page and keeping the old
+ * page locked too. That can lead to some false positives, but
+ * should be rare in practice.
+ */
+ PredicateLockPageSplit(relation, oldblkno, newblkno);
}
/*
@@ -3132,9 +3058,6 @@ ClearOldPredicateLocks(void)
{
SERIALIZABLEXACT *finishedSxact;
PREDICATELOCK *predlock;
- int i;
- HASH_SEQ_STATUS seqstat;
- PREDICATELOCKTARGET *locktarget;
LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
finishedSxact = (SERIALIZABLEXACT *)
@@ -3232,35 +3155,6 @@ ClearOldPredicateLocks(void)
LWLockRelease(SerializablePredicateLockListLock);
LWLockRelease(SerializableFinishedListLock);
-
- if (!PredXact->NeedTargetLinkCleanup)
- return;
-
- /*
- * Clean up any targets which were disconnected from a prior version with
- * no predicate locks attached.
- */
- for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
- LWLockAcquire(FirstPredicateLockMgrLock + i, LW_EXCLUSIVE);
- LWLockAcquire(PredicateLockNextRowLinkLock, LW_SHARED);
-
- hash_seq_init(&seqstat, PredicateLockTargetHash);
- while ((locktarget = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
- {
- if (SHMQueueEmpty(&locktarget->predicateLocks)
- && locktarget->priorVersionOfRow == NULL
- && locktarget->nextVersionOfRow == NULL)
- {
- hash_search(PredicateLockTargetHash, &locktarget->tag,
- HASH_REMOVE, NULL);
- }
- }
-
- PredXact->NeedTargetLinkCleanup = false;
-
- LWLockRelease(PredicateLockNextRowLinkLock);
- for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
- LWLockRelease(FirstPredicateLockMgrLock + i);
}
/*
@@ -3676,38 +3570,15 @@ CheckForSerializableConflictOut(const bool visible, const Relation relation,
}
/*
- * Check a particular target for rw-dependency conflict in. This will
- * also check prior versions of a tuple, if any.
+ * Check a particular target for rw-dependency conflict in.
*/
static void
CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
{
- PREDICATELOCKTARGETTAG nexttargettag = { 0 };
- PREDICATELOCKTARGETTAG thistargettag;
-
- for (;;)
- {
- if (!CheckSingleTargetForConflictsIn(targettag, &nexttargettag))
- break;
- thistargettag = nexttargettag;
- targettag = &thistargettag;
- }
-}
-
-/*
- * Check a particular target for rw-dependency conflict in. If the tuple
- * has prior versions, returns true and *nexttargettag is set to the tag
- * of the prior tuple version.
- */
-static bool
-CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
- PREDICATELOCKTARGETTAG *nexttargettag)
-{
uint32 targettaghash;
LWLockId partitionLock;
PREDICATELOCKTARGET *target;
PREDICATELOCK *predlock;
- bool hasnexttarget = false;
Assert(MySerializableXact != InvalidSerializableXact);
@@ -3717,7 +3588,6 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
targettaghash = PredicateLockTargetTagHashCode(targettag);
partitionLock = PredicateLockHashPartitionLock(targettaghash);
LWLockAcquire(partitionLock, LW_SHARED);
- LWLockAcquire(PredicateLockNextRowLinkLock, LW_SHARED);
target = (PREDICATELOCKTARGET *)
hash_search_with_hash_value(PredicateLockTargetHash,
targettag, targettaghash,
@@ -3725,21 +3595,9 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
if (!target)
{
/* Nothing has this target locked; we're done here. */
- LWLockRelease(PredicateLockNextRowLinkLock);
LWLockRelease(partitionLock);
- return false;
- }
-
- /*
- * If the target is linked to a prior version of the row, save the tag so
- * that it can be used for iterative calls to this function.
- */
- if (target->priorVersionOfRow != NULL)
- {
- *nexttargettag = target->priorVersionOfRow->tag;
- hasnexttarget = true;
+ return;
}
- LWLockRelease(PredicateLockNextRowLinkLock);
/*
* Each lock for an overlapping transaction represents a conflict: a
@@ -3828,17 +3686,25 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
hash_search_with_hash_value(LocalPredicateLockHash,
targettag, targettaghash,
HASH_FIND, NULL);
- Assert(locallock != NULL);
- Assert(locallock->held);
- locallock->held = false;
- if (locallock->childLocks == 0)
+ /*
+ * Remove entry in local lock table if it exists and has
+ * no children. It's OK if it doesn't exist; that means
+ * the lock was transferred to a new target by a
+ * different backend.
+ */
+ if (locallock != NULL)
{
- rmlocallock = (LOCALPREDICATELOCK *)
- hash_search_with_hash_value(LocalPredicateLockHash,
- targettag, targettaghash,
- HASH_REMOVE, NULL);
- Assert(rmlocallock == locallock);
+ locallock->held = false;
+
+ if (locallock->childLocks == 0)
+ {
+ rmlocallock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmlocallock == locallock);
+ }
}
DecrementParentLocks(targettag);
@@ -3848,7 +3714,7 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
* the target, bail out before re-acquiring the locks.
*/
if (rmtarget)
- return hasnexttarget;
+ return;
/*
* The list has been altered. Start over at the front.
@@ -3895,8 +3761,6 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
}
LWLockRelease(SerializableXactHashLock);
LWLockRelease(partitionLock);
-
- return hasnexttarget;
}
/*
@@ -3943,7 +3807,8 @@ CheckForSerializableConflictIn(const Relation relation, const HeapTuple tuple,
relation->rd_node.dbNode,
relation->rd_id,
ItemPointerGetBlockNumber(&(tuple->t_data->t_ctid)),
- ItemPointerGetOffsetNumber(&(tuple->t_data->t_ctid)));
+ ItemPointerGetOffsetNumber(&(tuple->t_data->t_ctid)),
+ HeapTupleHeaderGetXmin(tuple->t_data));
CheckTargetForConflictsIn(&targettag);
}
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 99fe37ef83b..ad0bcd775b0 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -78,7 +78,6 @@ typedef enum LWLockId
SerializableFinishedListLock,
SerializablePredicateLockListLock,
OldSerXidLock,
- PredicateLockNextRowLinkLock,
/* Individual lock IDs end here */
FirstBufMappingLock,
FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
diff --git a/src/include/storage/predicate_internals.h b/src/include/storage/predicate_internals.h
index 32e9a1bc218..05561256367 100644
--- a/src/include/storage/predicate_internals.h
+++ b/src/include/storage/predicate_internals.h
@@ -150,8 +150,6 @@ typedef struct PredXactListData
SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this
* seq no */
SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */
- bool NeedTargetLinkCleanup; /* to save cleanup effort for rare
- * case */
PredXactListElement element;
} PredXactListData;
@@ -231,9 +229,13 @@ typedef struct SERIALIZABLEXID
/*
* The PREDICATELOCKTARGETTAG struct identifies a database object which can
- * be the target of predicate locks. It is designed to fit into 16 bytes
- * with no padding. Note that this would need adjustment if we widen Oid or
- * BlockNumber to more than 32 bits.
+ * be the target of predicate locks.
+ *
+ * Note that the hash function being used doesn't properly respect tag
+ * length -- it will go to a four byte boundary past the end of the tag.
+ * If you change this struct, make sure any slack space is initialized,
+ * so that any random bytes in the middle or at the end are not included
+ * in the hash.
*
* TODO SSI: If we always use the same fields for the same type of value, we
* should rename these. Holding off until it's clear there are no exceptions.
@@ -247,8 +249,8 @@ typedef struct PREDICATELOCKTARGETTAG
uint32 locktag_field1; /* a 32-bit ID field */
uint32 locktag_field2; /* a 32-bit ID field */
uint32 locktag_field3; /* a 32-bit ID field */
- uint16 locktag_field4; /* a 16-bit ID field */
- uint16 locktag_field5; /* a 16-bit ID field */
+ uint32 locktag_field4; /* a 32-bit ID field */
+ uint32 locktag_field5; /* a 32-bit ID field */
} PREDICATELOCKTARGETTAG;
/*
@@ -260,12 +262,11 @@ typedef struct PREDICATELOCKTARGETTAG
* already have one. An entry is removed when the last lock is removed from
* its list.
*
- * Because a check for predicate locks on a tuple target should also find
- * locks on previous versions of the same row, if there are any created by
- * overlapping transactions, we keep a pointer to the target for the prior
- * version of the row. We also keep a pointer to the next version of the
- * row, so that when we no longer have any predicate locks and the back
- * pointer is clear, we can clean up the prior pointer for the next version.
+ * Because a particular target might become obsolete, due to update to a new
+ * version, before the reading transaction is obsolete, we need some way to
+ * prevent errors from reuse of a tuple ID. Rather than attempting to clean
+ * up the targets as the related tuples are pruned or vacuumed, we check the
+ * xmin on access. This should be far less costly.
*/
typedef struct PREDICATELOCKTARGET PREDICATELOCKTARGET;
@@ -277,15 +278,6 @@ struct PREDICATELOCKTARGET
/* data */
SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with
* predicate lock target */
-
- /*
- * The following two pointers are only used for tuple locks, and are only
- * consulted for conflict detection and cleanup; not for granularity
- * promotion.
- */
- PREDICATELOCKTARGET *priorVersionOfRow; /* what other locks to check */
- PREDICATELOCKTARGET *nextVersionOfRow; /* who has pointer here for
- * more targets */
};
@@ -387,30 +379,32 @@ typedef struct PredicateLockData
(locktag).locktag_field2 = (reloid), \
(locktag).locktag_field3 = InvalidBlockNumber, \
(locktag).locktag_field4 = InvalidOffsetNumber, \
- (locktag).locktag_field5 = 0)
+ (locktag).locktag_field5 = InvalidTransactionId)
#define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
((locktag).locktag_field1 = (dboid), \
(locktag).locktag_field2 = (reloid), \
(locktag).locktag_field3 = (blocknum), \
(locktag).locktag_field4 = InvalidOffsetNumber, \
- (locktag).locktag_field5 = 0)
+ (locktag).locktag_field5 = InvalidTransactionId)
-#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
+#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum,xmin) \
((locktag).locktag_field1 = (dboid), \
(locktag).locktag_field2 = (reloid), \
(locktag).locktag_field3 = (blocknum), \
(locktag).locktag_field4 = (offnum), \
- (locktag).locktag_field5 = 0)
+ (locktag).locktag_field5 = (xmin))
#define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
- ((locktag).locktag_field1)
+ ((Oid) (locktag).locktag_field1)
#define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
- ((locktag).locktag_field2)
+ ((Oid) (locktag).locktag_field2)
#define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
- ((locktag).locktag_field3)
+ ((BlockNumber) (locktag).locktag_field3)
#define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
- ((locktag).locktag_field4)
+ ((OffsetNumber) (locktag).locktag_field4)
+#define GET_PREDICATELOCKTARGETTAG_XMIN(locktag) \
+ ((TransactionId) (locktag).locktag_field5)
#define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \
(((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
(((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \