Fix bugs in Serializable Snapshot Isolation.

Change the way UPDATEs are handled. Instead of maintaining a chain of tuple-level locks in shared memory, copy any existing locks on the old tuple to the new tuple at UPDATE. Any existing page-level lock needs to be duplicated too, as a lock on the new tuple. That was neglected previously. Store xmin on tuple-level predicate locks, to distinguish a lock on an old already-recycled tuple from a new tuple at the same physical location. Failure to distinguish them caused loops in the tuple-lock chains, as reported by YAMAMOTO Takashi. Although we don't use the chain representation of UPDATEs anymore, it seems like a good idea to store the xmin to avoid some false positives if no other reason. CheckSingleTargetForConflictsIn now correctly handles the case where a lock that's being held is not reflected in the local lock table. That happens if another backend acquires a lock on our behalf due to an UPDATE or a page split. PredicateLockPageCombine now retains locks for the page that is being removed, rather than removing them. This prevents a potentially dangerous false-positive inconsistency where the local lock table believes that a lock is held, but it is actually not. Dan Ports and Kevin Grittner
author: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2011-03-01 19:05:16 +0200
committer: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2011-03-01 19:05:16 +0200
commit: 47ad79122bc099c1f0ea8a7ae413fcd8d45e26a6 (patch)
tree: b8a4ac5fa3cff5235a5693ca31825ecbcfdd97db
parent: 16143d64513e4dc3c72bad7ae98d3df0b5a23013 (diff)
download: postgresql-47ad79122bc099c1f0ea8a7ae413fcd8d45e26a6.tar.gz
postgresql-47ad79122bc099c1f0ea8a7ae413fcd8d45e26a6.zip
4 files changed, 148 insertions, 291 deletions
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index ba01874469b..7a0e1a9c25e 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -824,7 +824,6 @@ restart:
 	if (_bt_page_recyclable(page))
 	{
 		/* Okay to recycle this page */
-		Assert(!PageIsPredicateLocked(rel, blkno));
 		RecordFreeIndexPage(rel, blkno);
 		vstate->totFreePages++;
 		stats->pages_deleted++;
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index d660ce5af78..aa657fab579 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -124,10 +124,6 @@
  *	SerializableXactHashLock
  *		- Protects both PredXact and SerializableXidHash.
  *
- *	PredicateLockNextRowLinkLock
- *		- Protects the priorVersionOfRow and nextVersionOfRow fields of
- *			PREDICATELOCKTARGET when linkage is being created or destroyed.
- *
  *
  * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -444,8 +440,6 @@ static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
 						   bool summarize);
 static bool XidIsConcurrent(TransactionId xid);
 static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag);
-static bool CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
-						  PREDICATELOCKTARGETTAG *nexttargettag);
 static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
 static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
 										SERIALIZABLEXACT *writer);
@@ -1044,7 +1038,6 @@ InitPredicateLocks(void)
 		PredXact->LastSxactCommitSeqNo = FirstNormalSerCommitSeqNo - 1;
 		PredXact->CanPartialClearThrough = 0;
 		PredXact->HavePartialClearedThrough = 0;
-		PredXact->NeedTargetLinkCleanup = false;
 		requestSize = mul_size((Size) max_table_size,
 							   PredXactListElementDataSize);
 		PredXact->element = ShmemAlloc(requestSize);
@@ -1651,9 +1644,10 @@ PageIsPredicateLocked(const Relation relation, const BlockNumber blkno)
  * Important note: this function may return false even if the lock is
  * being held, because it uses the local lock table which is not
  * updated if another transaction modifies our lock list (e.g. to
- * split an index page). However, it will never return true if the
- * lock is not held. We only use this function in circumstances where
- * such false negatives are acceptable.
+ * split an index page). It can also return true when a coarser
+ * granularity lock that covers this target is being held. Be careful
+ * to only use this function in circumstances where such errors are
+ * acceptable!
  */
 static bool
 PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag)
@@ -1717,6 +1711,9 @@ GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
 /*
  * Check whether the lock we are considering is already covered by a
  * coarser lock for our transaction.
+ *
+ * Like PredicateLockExists, this function might return a false
+ * negative, but it will never return a false positive.
  */
 static bool
 CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag)
@@ -1747,7 +1744,6 @@ static void
 RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
 {
 	PREDICATELOCKTARGET *rmtarget;
-	PREDICATELOCKTARGET *next;
 
 	Assert(LWLockHeldByMe(SerializablePredicateLockListLock));
 
@@ -1755,33 +1751,6 @@ RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
 	if (!SHMQueueEmpty(&target->predicateLocks))
 		return;
 
-	/* Can't remove it if there are locks for a prior row version. */
-	LWLockAcquire(PredicateLockNextRowLinkLock, LW_EXCLUSIVE);
-	if (target->priorVersionOfRow != NULL)
-	{
-		LWLockRelease(PredicateLockNextRowLinkLock);
-		return;
-	}
-
-	/*
-	 * We are going to release this target,  This requires that we let the
-	 * next version of the row (if any) know that it's previous version is
-	 * done.
-	 *
-	 * It might be that the link was all that was keeping the other target
-	 * from cleanup, but we can't clean that up here -- LW locking is all
-	 * wrong for that.	We'll pass the HTAB in the general cleanup function to
-	 * get rid of such "dead" targets.
-	 */
-	next = target->nextVersionOfRow;
-	if (next != NULL)
-	{
-		next->priorVersionOfRow = NULL;
-		if (SHMQueueEmpty(&next->predicateLocks))
-			PredXact->NeedTargetLinkCleanup = true;
-	}
-	LWLockRelease(PredicateLockNextRowLinkLock);
-
 	/* Actually remove the target. */
 	rmtarget = hash_search_with_hash_value(PredicateLockTargetHash,
 										   &target->tag,
@@ -2065,11 +2034,7 @@ CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
 				 errmsg("out of shared memory"),
 				 errhint("You might need to increase max_pred_locks_per_transaction.")));
 	if (!found)
-	{
 		SHMQueueInit(&(target->predicateLocks));
-		target->priorVersionOfRow = NULL;
-		target->nextVersionOfRow = NULL;
-	}
 
 	/* We've got the sxact and target, make sure they're joined. */
 	locktag.myTarget = target;
@@ -2125,8 +2090,6 @@ PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag)
 		hash_search_with_hash_value(LocalPredicateLockHash,
 									targettag, targettaghash,
 									HASH_ENTER, &found);
-	/* We should not hold the lock (but its entry might still exist) */
-	Assert(!found || !locallock->held);
 	locallock->held = true;
 	if (!found)
 		locallock->childLocks = 0;
@@ -2215,6 +2178,7 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
 {
 	PREDICATELOCKTARGETTAG tag;
 	ItemPointer tid;
+	TransactionId targetxmin;
 
 	if (SkipSerialization(relation))
 		return;
@@ -2224,15 +2188,16 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
 	 */
 	if (relation->rd_index == NULL)
 	{
-		TransactionId myxid = GetTopTransactionIdIfAny();
+		TransactionId	myxid;
+
+		targetxmin = HeapTupleHeaderGetXmin(tuple->t_data);
 
+		myxid = GetTopTransactionIdIfAny();
 		if (TransactionIdIsValid(myxid))
 		{
-			TransactionId xid = HeapTupleHeaderGetXmin(tuple->t_data);
-
-			if (TransactionIdFollowsOrEquals(xid, TransactionXmin))
+			if (TransactionIdFollowsOrEquals(targetxmin, TransactionXmin))
 			{
-				xid = SubTransGetTopmostTransaction(xid);
+				TransactionId xid = SubTransGetTopmostTransaction(targetxmin);
 				if (TransactionIdEquals(xid, myxid))
 				{
 					/* We wrote it; we already have a write lock. */
@@ -2241,6 +2206,8 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
 			}
 		}
 	}
+	else
+		targetxmin = InvalidTransactionId;
 
 	/*
 	 * Do quick-but-not-definitive test for a relation lock first.	This will
@@ -2259,116 +2226,78 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
 									 relation->rd_node.dbNode,
 									 relation->rd_id,
 									 ItemPointerGetBlockNumber(tid),
-									 ItemPointerGetOffsetNumber(tid));
+									 ItemPointerGetOffsetNumber(tid),
+									 targetxmin);
 	PredicateLockAcquire(&tag);
 }
 
 /*
- * If the old tuple has any predicate locks, create a lock target for the
- * new tuple and point them at each other.	Conflict detection needs to
- * look for locks against prior versions of the row.
+ * If the old tuple has any predicate locks, copy them to the new target.
+ *
+ * This is called at an UPDATE, where any predicate locks held on the old
+ * tuple need to be copied to the new tuple, because logically they both
+ * represent the same row. A lock taken before the update must conflict
+ * with anyone locking the same row after the update.
  */
 void
 PredicateLockTupleRowVersionLink(const Relation relation,
 								 const HeapTuple oldTuple,
 								 const HeapTuple newTuple)
 {
-	PREDICATELOCKTARGETTAG oldtargettag;
-	PREDICATELOCKTARGETTAG newtargettag;
-	PREDICATELOCKTARGET *oldtarget;
-	PREDICATELOCKTARGET *newtarget;
-	PREDICATELOCKTARGET *next;
-	uint32		oldtargettaghash;
-	LWLockId	oldpartitionLock;
-	uint32		newtargettaghash;
-	LWLockId	newpartitionLock;
-	bool		found;
-
-	SET_PREDICATELOCKTARGETTAG_TUPLE(oldtargettag,
+	PREDICATELOCKTARGETTAG oldtupletag;
+	PREDICATELOCKTARGETTAG oldpagetag;
+	PREDICATELOCKTARGETTAG newtupletag;
+	BlockNumber	oldblk,
+				newblk;
+	OffsetNumber oldoff,
+				newoff;
+	TransactionId oldxmin,
+				newxmin;
+
+	oldblk = ItemPointerGetBlockNumber(&(oldTuple->t_self));
+	oldoff = ItemPointerGetOffsetNumber(&(oldTuple->t_self));
+	oldxmin = HeapTupleHeaderGetXmin(oldTuple->t_data);
+
+	newblk = ItemPointerGetBlockNumber(&(newTuple->t_self));
+	newoff = ItemPointerGetOffsetNumber(&(newTuple->t_self));
+	newxmin = HeapTupleHeaderGetXmin(newTuple->t_data);
+
+	SET_PREDICATELOCKTARGETTAG_TUPLE(oldtupletag,
 									 relation->rd_node.dbNode,
 									 relation->rd_id,
-							  ItemPointerGetBlockNumber(&(oldTuple->t_self)),
-							ItemPointerGetOffsetNumber(&(oldTuple->t_self)));
-	oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
-	oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+									 oldblk,
+									 oldoff,
+									 oldxmin);
 
-	SET_PREDICATELOCKTARGETTAG_TUPLE(newtargettag,
+	SET_PREDICATELOCKTARGETTAG_PAGE(oldpagetag,
+									relation->rd_node.dbNode,
+									relation->rd_id,
+									oldblk);
+
+	SET_PREDICATELOCKTARGETTAG_TUPLE(newtupletag,
 									 relation->rd_node.dbNode,
 									 relation->rd_id,
-							  ItemPointerGetBlockNumber(&(newTuple->t_self)),
-							ItemPointerGetOffsetNumber(&(newTuple->t_self)));
-	newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
-	newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
-
-	/* Lock lower numbered partition first. */
-	if (oldpartitionLock < newpartitionLock)
-	{
-		LWLockAcquire(oldpartitionLock, LW_SHARED);
-		LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
-	}
-	else if (newpartitionLock < oldpartitionLock)
-	{
-		LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
-		LWLockAcquire(oldpartitionLock, LW_SHARED);
-	}
-	else
-		LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
-
-	oldtarget = (PREDICATELOCKTARGET *)
-		hash_search_with_hash_value(PredicateLockTargetHash,
-									&oldtargettag, oldtargettaghash,
-									HASH_FIND, NULL);
-
-	/* Only need to link if there is an old target already. */
-	if (oldtarget)
-	{
-		LWLockAcquire(PredicateLockNextRowLinkLock, LW_EXCLUSIVE);
-
-		/* Guard against stale pointers from rollback. */
-		next = oldtarget->nextVersionOfRow;
-		if (next != NULL)
-		{
-			next->priorVersionOfRow = NULL;
-			oldtarget->nextVersionOfRow = NULL;
-		}
-
-		/* Find or create the new target, and link old and new. */
-		newtarget = (PREDICATELOCKTARGET *)
-			hash_search_with_hash_value(PredicateLockTargetHash,
-										&newtargettag, newtargettaghash,
-										HASH_ENTER, &found);
-		if (!newtarget)
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of shared memory"),
-					 errhint("You might need to increase max_pred_locks_per_transaction.")));
-		if (!found)
-		{
-			SHMQueueInit(&(newtarget->predicateLocks));
-			newtarget->nextVersionOfRow = NULL;
-		}
-		else
-			Assert(newtarget->priorVersionOfRow == NULL);
+									 newblk,
+									 newoff,
+									 newxmin);
 
-		newtarget->priorVersionOfRow = oldtarget;
-		oldtarget->nextVersionOfRow = newtarget;
+	/*
+	 * A page-level lock on the page containing the old tuple counts too.
+	 * Anyone holding a lock on the page is logically holding a lock on
+	 * the old tuple, so we need to acquire a lock on his behalf on the
+	 * new tuple too. However, if the new tuple is on the same page as the
+	 * old one, the old page-level lock already covers the new tuple.
+	 *
+	 * A relation-level lock always covers both tuple versions, so we don't
+	 * need to worry about those here.
+	 */
+	LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
 
-		LWLockRelease(PredicateLockNextRowLinkLock);
-	}
+	TransferPredicateLocksToNewTarget(oldtupletag, newtupletag, false);
+	if (newblk != oldblk)
+		TransferPredicateLocksToNewTarget(oldpagetag, newtupletag, false);
 
-	/* Release lower number partition last. */
-	if (oldpartitionLock < newpartitionLock)
-	{
-		LWLockRelease(newpartitionLock);
-		LWLockRelease(oldpartitionLock);
-	}
-	else if (newpartitionLock < oldpartitionLock)
-	{
-		LWLockRelease(oldpartitionLock);
-		LWLockRelease(newpartitionLock);
-	}
-	else
-		LWLockRelease(newpartitionLock);
+	LWLockRelease(SerializablePredicateLockListLock);
 }
 
 
@@ -2437,6 +2366,17 @@ DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash)
  * removeOld is set (by using the reserved entry in
  * PredicateLockTargetHash for scratch space).
  *
+ * Warning: the "removeOld" option should be used only with care,
+ * because this function does not (indeed, can not) update other
+ * backends' LocalPredicateLockHash. If we are only adding new
+ * entries, this is not a problem: the local lock table is used only
+ * as a hint, so missing entries for locks that are held are
+ * OK. Having entries for locks that are no longer held, as can happen
+ * when using "removeOld", is not in general OK. We can only use it
+ * safely when replacing a lock with a coarser-granularity lock that
+ * covers it, or if we are absolutely certain that no one will need to
+ * refer to that lock in the future.
+ *
  * Caller must hold SerializablePredicateLockListLock.
  */
 static bool
@@ -2533,11 +2473,7 @@ TransferPredicateLocksToNewTarget(const PREDICATELOCKTARGETTAG oldtargettag,
 
 		/* If we created a new entry, initialize it */
 		if (!found)
-		{
 			SHMQueueInit(&(newtarget->predicateLocks));
-			newtarget->priorVersionOfRow = NULL;
-			newtarget->nextVersionOfRow = NULL;
-		}
 
 		newpredlocktag.myTarget = newtarget;
 
@@ -2704,7 +2640,14 @@ PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno,
 											&newtargettag);
 		Assert(success);
 
-		/* Move the locks to the parent. This shouldn't fail.  */
+		/*
+		 * Move the locks to the parent. This shouldn't fail.
+		 *
+		 * Note that here we are removing locks held by other
+		 * backends, leading to a possible inconsistency in their
+		 * local lock hash table. This is OK because we're replacing
+		 * it with a lock that covers the old one.
+		 */
 		success = TransferPredicateLocksToNewTarget(oldtargettag,
 													newtargettag,
 													true);
@@ -2727,36 +2670,19 @@ void
 PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno,
 						 const BlockNumber newblkno)
 {
-	PREDICATELOCKTARGETTAG oldtargettag;
-	PREDICATELOCKTARGETTAG newtargettag;
-	bool		success;
-
-
-	if (SkipSplitTracking(relation))
-		return;
-
-	Assert(oldblkno != newblkno);
-	Assert(BlockNumberIsValid(oldblkno));
-	Assert(BlockNumberIsValid(newblkno));
-
-	SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
-									relation->rd_node.dbNode,
-									relation->rd_id,
-									oldblkno);
-	SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
-									relation->rd_node.dbNode,
-									relation->rd_id,
-									newblkno);
-
-	LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
-
-	/* Move the locks. This shouldn't fail. */
-	success = TransferPredicateLocksToNewTarget(oldtargettag,
-												newtargettag,
-												true);
-	Assert(success);
-
-	LWLockRelease(SerializablePredicateLockListLock);
+	/*
+	 * Page combines differ from page splits in that we ought to be
+	 * able to remove the locks on the old page after transferring
+	 * them to the new page, instead of duplicating them. However,
+	 * because we can't edit other backends' local lock tables,
+	 * removing the old lock would leave them with an entry in their
+	 * LocalPredicateLockHash for a lock they're not holding, which
+	 * isn't acceptable. So we wind up having to do the same work as a
+	 * page split, acquiring a lock on the new page and keeping the old
+	 * page locked too. That can lead to some false positives, but
+	 * should be rare in practice.
+	 */
+	PredicateLockPageSplit(relation, oldblkno, newblkno);
 }
 
 /*
@@ -3132,9 +3058,6 @@ ClearOldPredicateLocks(void)
 {
 	SERIALIZABLEXACT *finishedSxact;
 	PREDICATELOCK *predlock;
-	int			i;
-	HASH_SEQ_STATUS seqstat;
-	PREDICATELOCKTARGET *locktarget;
 
 	LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
 	finishedSxact = (SERIALIZABLEXACT *)
@@ -3232,35 +3155,6 @@ ClearOldPredicateLocks(void)
 
 	LWLockRelease(SerializablePredicateLockListLock);
 	LWLockRelease(SerializableFinishedListLock);
-
-	if (!PredXact->NeedTargetLinkCleanup)
-		return;
-
-	/*
-	 * Clean up any targets which were disconnected from a prior version with
-	 * no predicate locks attached.
-	 */
-	for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
-		LWLockAcquire(FirstPredicateLockMgrLock + i, LW_EXCLUSIVE);
-	LWLockAcquire(PredicateLockNextRowLinkLock, LW_SHARED);
-
-	hash_seq_init(&seqstat, PredicateLockTargetHash);
-	while ((locktarget = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
-	{
-		if (SHMQueueEmpty(&locktarget->predicateLocks)
-			&& locktarget->priorVersionOfRow == NULL
-			&& locktarget->nextVersionOfRow == NULL)
-		{
-			hash_search(PredicateLockTargetHash, &locktarget->tag,
-						HASH_REMOVE, NULL);
-		}
-	}
-
-	PredXact->NeedTargetLinkCleanup = false;
-
-	LWLockRelease(PredicateLockNextRowLinkLock);
-	for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
-		LWLockRelease(FirstPredicateLockMgrLock + i);
 }
 
 /*
@@ -3676,38 +3570,15 @@ CheckForSerializableConflictOut(const bool visible, const Relation relation,
 }
 
 /*
- * Check a particular target for rw-dependency conflict in. This will
- * also check prior versions of a tuple, if any.
+ * Check a particular target for rw-dependency conflict in.
  */
 static void
 CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
 {
-	PREDICATELOCKTARGETTAG nexttargettag = { 0 };
-	PREDICATELOCKTARGETTAG thistargettag;
-
-	for (;;)
-	{
-		if (!CheckSingleTargetForConflictsIn(targettag, &nexttargettag))
-			break;
-		thistargettag = nexttargettag;
-		targettag = &thistargettag;
-	}
-}
-
-/*
- * Check a particular target for rw-dependency conflict in. If the tuple
- * has prior versions, returns true and *nexttargettag is set to the tag
- * of the prior tuple version.
- */
-static bool
-CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
-								PREDICATELOCKTARGETTAG *nexttargettag)
-{
 	uint32		targettaghash;
 	LWLockId	partitionLock;
 	PREDICATELOCKTARGET *target;
 	PREDICATELOCK *predlock;
-	bool		hasnexttarget = false;
 
 	Assert(MySerializableXact != InvalidSerializableXact);
 
@@ -3717,7 +3588,6 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
 	targettaghash = PredicateLockTargetTagHashCode(targettag);
 	partitionLock = PredicateLockHashPartitionLock(targettaghash);
 	LWLockAcquire(partitionLock, LW_SHARED);
-	LWLockAcquire(PredicateLockNextRowLinkLock, LW_SHARED);
 	target = (PREDICATELOCKTARGET *)
 		hash_search_with_hash_value(PredicateLockTargetHash,
 									targettag, targettaghash,
@@ -3725,21 +3595,9 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
 	if (!target)
 	{
 		/* Nothing has this target locked; we're done here. */
-		LWLockRelease(PredicateLockNextRowLinkLock);
 		LWLockRelease(partitionLock);
-		return false;
-	}
-
-	/*
-	 * If the target is linked to a prior version of the row, save the tag so
-	 * that it can be used for iterative calls to this function.
-	 */
-	if (target->priorVersionOfRow != NULL)
-	{
-		*nexttargettag = target->priorVersionOfRow->tag;
-		hasnexttarget = true;
+		return;
 	}
-	LWLockRelease(PredicateLockNextRowLinkLock);
 
 	/*
 	 * Each lock for an overlapping transaction represents a conflict: a
@@ -3828,17 +3686,25 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
 						hash_search_with_hash_value(LocalPredicateLockHash,
 													targettag, targettaghash,
 													HASH_FIND, NULL);
-					Assert(locallock != NULL);
-					Assert(locallock->held);
-					locallock->held = false;
 
-					if (locallock->childLocks == 0)
+					/*
+					 * Remove entry in local lock table if it exists and has
+					 * no children. It's OK if it doesn't exist; that means
+					 * the lock was transferred to a new target by a
+					 * different backend.
+					 */
+					if (locallock != NULL)
 					{
-						rmlocallock = (LOCALPREDICATELOCK *)
-							hash_search_with_hash_value(LocalPredicateLockHash,
-													targettag, targettaghash,
-														HASH_REMOVE, NULL);
-						Assert(rmlocallock == locallock);
+						locallock->held = false;
+
+						if (locallock->childLocks == 0)
+						{
+							rmlocallock = (LOCALPREDICATELOCK *)
+								hash_search_with_hash_value(LocalPredicateLockHash,
+															targettag, targettaghash,
+															HASH_REMOVE, NULL);
+							Assert(rmlocallock == locallock);
+						}
 					}
 
 					DecrementParentLocks(targettag);
@@ -3848,7 +3714,7 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
 					 * the target, bail out before re-acquiring the locks.
 					 */
 					if (rmtarget)
-						return hasnexttarget;
+						return;
 
 					/*
 					 * The list has been altered.  Start over at the front.
@@ -3895,8 +3761,6 @@ CheckSingleTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag,
 	}
 	LWLockRelease(SerializableXactHashLock);
 	LWLockRelease(partitionLock);
-
-	return hasnexttarget;
 }
 
 /*
@@ -3943,7 +3807,8 @@ CheckForSerializableConflictIn(const Relation relation, const HeapTuple tuple,
 										 relation->rd_node.dbNode,
 										 relation->rd_id,
 						 ItemPointerGetBlockNumber(&(tuple->t_data->t_ctid)),
-					   ItemPointerGetOffsetNumber(&(tuple->t_data->t_ctid)));
+					   ItemPointerGetOffsetNumber(&(tuple->t_data->t_ctid)),
+					   HeapTupleHeaderGetXmin(tuple->t_data));
 		CheckTargetForConflictsIn(&targettag);
 	}
 
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 99fe37ef83b..ad0bcd775b0 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -78,7 +78,6 @@ typedef enum LWLockId
 	SerializableFinishedListLock,
 	SerializablePredicateLockListLock,
 	OldSerXidLock,
-	PredicateLockNextRowLinkLock,
 	/* Individual lock IDs end here */
 	FirstBufMappingLock,
 	FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
diff --git a/src/include/storage/predicate_internals.h b/src/include/storage/predicate_internals.h
index 32e9a1bc218..05561256367 100644
--- a/src/include/storage/predicate_internals.h
+++ b/src/include/storage/predicate_internals.h
@@ -150,8 +150,6 @@ typedef struct PredXactListData
 	SerCommitSeqNo HavePartialClearedThrough;	/* have cleared through this
 												 * seq no */
 	SERIALIZABLEXACT *OldCommittedSxact;		/* shared copy of dummy sxact */
-	bool		NeedTargetLinkCleanup;	/* to save cleanup effort for rare
-										 * case */
 
 	PredXactListElement element;
 } PredXactListData;
@@ -231,9 +229,13 @@ typedef struct SERIALIZABLEXID
 
 /*
  * The PREDICATELOCKTARGETTAG struct identifies a database object which can
- * be the target of predicate locks.  It is designed to fit into 16 bytes
- * with no padding.  Note that this would need adjustment if we widen Oid or
- * BlockNumber to more than 32 bits.
+ * be the target of predicate locks.
+ *
+ * Note that the hash function being used doesn't properly respect tag
+ * length -- it will go to a four byte boundary past the end of the tag.
+ * If you change this struct, make sure any slack space is initialized,
+ * so that any random bytes in the middle or at the end are not included
+ * in the hash.
  *
  * TODO SSI: If we always use the same fields for the same type of value, we
  * should rename these.  Holding off until it's clear there are no exceptions.
@@ -247,8 +249,8 @@ typedef struct PREDICATELOCKTARGETTAG
 	uint32		locktag_field1; /* a 32-bit ID field */
 	uint32		locktag_field2; /* a 32-bit ID field */
 	uint32		locktag_field3; /* a 32-bit ID field */
-	uint16		locktag_field4; /* a 16-bit ID field */
-	uint16		locktag_field5; /* a 16-bit ID field */
+	uint32		locktag_field4; /* a 32-bit ID field */
+	uint32		locktag_field5; /* a 32-bit ID field */
 } PREDICATELOCKTARGETTAG;
 
 /*
@@ -260,12 +262,11 @@ typedef struct PREDICATELOCKTARGETTAG
  * already have one.  An entry is removed when the last lock is removed from
  * its list.
  *
- * Because a check for predicate locks on a tuple target should also find
- * locks on previous versions of the same row, if there are any created by
- * overlapping transactions, we keep a pointer to the target for the prior
- * version of the row.	We also keep a pointer to the next version of the
- * row, so that when we no longer have any predicate locks and the back
- * pointer is clear, we can clean up the prior pointer for the next version.
+ * Because a particular target might become obsolete, due to update to a new
+ * version, before the reading transaction is obsolete, we need some way to
+ * prevent errors from reuse of a tuple ID.  Rather than attempting to clean
+ * up the targets as the related tuples are pruned or vacuumed, we check the
+ * xmin on access.  This should be far less costly.
  */
 typedef struct PREDICATELOCKTARGET PREDICATELOCKTARGET;
 
@@ -277,15 +278,6 @@ struct PREDICATELOCKTARGET
 	/* data */
 	SHM_QUEUE	predicateLocks; /* list of PREDICATELOCK objects assoc. with
 								 * predicate lock target */
-
-	/*
-	 * The following two pointers are only used for tuple locks, and are only
-	 * consulted for conflict detection and cleanup; not for granularity
-	 * promotion.
-	 */
-	PREDICATELOCKTARGET *priorVersionOfRow;		/* what other locks to check */
-	PREDICATELOCKTARGET *nextVersionOfRow;		/* who has pointer here for
-												 * more targets */
 };
 
 
@@ -387,30 +379,32 @@ typedef struct PredicateLockData
 	 (locktag).locktag_field2 = (reloid), \
 	 (locktag).locktag_field3 = InvalidBlockNumber, \
 	 (locktag).locktag_field4 = InvalidOffsetNumber, \
-	 (locktag).locktag_field5 = 0)
+	 (locktag).locktag_field5 = InvalidTransactionId)
 
 #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
 	((locktag).locktag_field1 = (dboid), \
 	 (locktag).locktag_field2 = (reloid), \
 	 (locktag).locktag_field3 = (blocknum), \
 	 (locktag).locktag_field4 = InvalidOffsetNumber, \
-	 (locktag).locktag_field5 = 0)
+	 (locktag).locktag_field5 = InvalidTransactionId)
 
-#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
+#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum,xmin) \
 	((locktag).locktag_field1 = (dboid), \
 	 (locktag).locktag_field2 = (reloid), \
 	 (locktag).locktag_field3 = (blocknum), \
 	 (locktag).locktag_field4 = (offnum), \
-	 (locktag).locktag_field5 = 0)
+	 (locktag).locktag_field5 = (xmin))
 
 #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
-	((locktag).locktag_field1)
+	((Oid) (locktag).locktag_field1)
 #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
-	((locktag).locktag_field2)
+	((Oid) (locktag).locktag_field2)
 #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
-	((locktag).locktag_field3)
+	((BlockNumber) (locktag).locktag_field3)
 #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
-	((locktag).locktag_field4)
+	((OffsetNumber) (locktag).locktag_field4)
+#define GET_PREDICATELOCKTARGETTAG_XMIN(locktag) \
+	((TransactionId) (locktag).locktag_field5)
 #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag)							 \
 	(((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
 	 (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE :   \
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>	2011-03-01 19:05:16 +0200
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>	2011-03-01 19:05:16 +0200
commit	47ad79122bc099c1f0ea8a7ae413fcd8d45e26a6 (patch)
tree	b8a4ac5fa3cff5235a5693ca31825ecbcfdd97db
parent	16143d64513e4dc3c72bad7ae98d3df0b5a23013 (diff)
download	postgresql-47ad79122bc099c1f0ea8a7ae413fcd8d45e26a6.tar.gz postgresql-47ad79122bc099c1f0ea8a7ae413fcd8d45e26a6.zip