1 files changed, 152 insertions, 48 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 9c9bda5035c..8d40e8d952f 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.92 2000/10/28 16:20:55 vadim Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.93 2000/11/08 22:09:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -709,23 +709,28 @@ refcount = %ld, file: %s, line: %d\n",
 #endif
 
 /*
- * FlushBuffer -- like WriteBuffer, but force the page to disk.
+ * FlushBuffer -- like WriteBuffer, but write the page immediately,
+ * rather than just marking it dirty.  On success return, the buffer will
+ * no longer be dirty.
  *
  * 'buffer' is known to be dirty/pinned, so there should not be a
  * problem reading the BufferDesc members without the BufMgrLock
  * (nobody should be able to change tags out from under us).
  *
- * Unpin if 'release' is TRUE.
+ * If 'sync' is true, a synchronous write is wanted (wait for buffer to hit
+ * the disk).  Otherwise it's sufficient to issue the kernel write call.
+ *
+ * Unpin buffer if 'release' is true.
  */
 int
-FlushBuffer(Buffer buffer, bool release)
+FlushBuffer(Buffer buffer, bool sync, bool release)
 {
 	BufferDesc *bufHdr;
 	Relation	bufrel;
 	int			status;
 
 	if (BufferIsLocal(buffer))
-		return FlushLocalBuffer(buffer, release) ? STATUS_OK : STATUS_ERROR;
+		return FlushLocalBuffer(buffer, sync, release) ? STATUS_OK : STATUS_ERROR;
 
 	if (BAD_BUFFER_ID(buffer))
 		return STATUS_ERROR;
@@ -755,12 +760,16 @@ FlushBuffer(Buffer buffer, bool release)
 	 */
 	LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_SHARE);
 
-	status = smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum,
-					   (char *) MAKE_PTR(bufHdr->data));
+	if (sync)
+		status = smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum,
+						   (char *) MAKE_PTR(bufHdr->data));
+	else
+		status = smgrwrite(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum,
+						   (char *) MAKE_PTR(bufHdr->data));
 
 	LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_UNLOCK);
 
-	/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
+	/* drop relcache refcnt incremented by RelationNodeCacheGetRelation */
 	RelationDecrementReferenceCount(bufrel);
 
 	if (status == SM_FAIL)
@@ -926,7 +935,7 @@ SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr)
 
 			/*
 			 * drop relcache refcnt incremented by
-			 * RelationIdCacheGetRelation
+			 * RelationNodeCacheGetRelation
 			 */
 			RelationDecrementReferenceCount(reln);
 		}
@@ -1123,7 +1132,7 @@ BufferSync()
 						bufHdr->flags &= ~BM_DIRTY;
 				}
 
-				/* drop refcnt obtained by RelationIdCacheGetRelation */
+				/* drop refcnt obtained by RelationNodeCacheGetRelation */
 				if (reln != (Relation) NULL)
 					RelationDecrementReferenceCount(reln);
 			}
@@ -1154,7 +1163,7 @@ BufferSync()
 
 				/*
 				 * drop relcache refcnt incremented by
-				 * RelationIdCacheGetRelation
+				 * RelationNodeCacheGetRelation
 				 */
 				RelationDecrementReferenceCount(reln);
 
@@ -1458,7 +1467,7 @@ BufferReplace(BufferDesc *bufHdr)
 
 	SpinAcquire(BufMgrLock);
 
-	/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
+	/* drop relcache refcnt incremented by RelationNodeCacheGetRelation */
 	if (reln != (Relation) NULL)
 		RelationDecrementReferenceCount(reln);
 
@@ -1495,21 +1504,23 @@ RelationGetNumberOfBlocks(Relation relation)
 }
 
 /* ---------------------------------------------------------------------
- *		ReleaseRelationBuffers
+ *		DropRelationBuffers
  *
  *		This function removes all the buffered pages for a relation
  *		from the buffer pool.  Dirty pages are simply dropped, without
- *		bothering to write them out first.  This is used when the
- *		relation is about to be deleted.  We assume that the caller
- *		holds an exclusive lock on the relation, which should assure
- *		that no new buffers will be acquired for the rel meanwhile.
+ *		bothering to write them out first.  This is NOT rollback-able,
+ *		and so should be used only with extreme caution!
+ *
+ *		We assume that the caller holds an exclusive lock on the relation,
+ *		which should assure that no new buffers will be acquired for the rel
+ *		meanwhile.
  *
  *		XXX currently it sequentially searches the buffer pool, should be
  *		changed to more clever ways of searching.
  * --------------------------------------------------------------------
  */
 void
-ReleaseRelationBuffers(Relation rel)
+DropRelationBuffers(Relation rel)
 {
 	int			i;
 	BufferDesc *bufHdr;
@@ -1589,7 +1600,104 @@ recheck:
 		 * this rel, since we hold exclusive lock on this rel.
 		 */
 		if (RelFileNodeEquals(rel->rd_node, 
-					  BufferTagLastDirtied[i - 1].rnode))
+							  BufferTagLastDirtied[i - 1].rnode))
+			BufferDirtiedByMe[i - 1] = false;
+	}
+
+	SpinRelease(BufMgrLock);
+}
+
+/* ---------------------------------------------------------------------
+ *		DropRelFileNodeBuffers
+ *
+ *		This is the same as DropRelationBuffers, except that the target
+ *		relation is specified by RelFileNode.
+ *
+ *		This is NOT rollback-able.  One legitimate use is to clear the
+ *		buffer cache of buffers for a relation that is being deleted
+ *		during transaction abort.
+ * --------------------------------------------------------------------
+ */
+void
+DropRelFileNodeBuffers(RelFileNode rnode)
+{
+	int			i;
+	BufferDesc *bufHdr;
+
+	/* We have to search both local and shared buffers... */
+
+	for (i = 0; i < NLocBuffer; i++)
+	{
+		bufHdr = &LocalBufferDescriptors[i];
+		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+		{
+			bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+			LocalRefCount[i] = 0;
+			bufHdr->tag.rnode.relNode = InvalidOid;
+		}
+	}
+
+	SpinAcquire(BufMgrLock);
+	for (i = 1; i <= NBuffers; i++)
+	{
+		bufHdr = &BufferDescriptors[i - 1];
+recheck:
+		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+		{
+
+			/*
+			 * If there is I/O in progress, better wait till it's done;
+			 * don't want to delete the relation out from under someone
+			 * who's just trying to flush the buffer!
+			 */
+			if (bufHdr->flags & BM_IO_IN_PROGRESS)
+			{
+				WaitIO(bufHdr, BufMgrLock);
+
+				/*
+				 * By now, the buffer very possibly belongs to some other
+				 * rel, so check again before proceeding.
+				 */
+				goto recheck;
+			}
+			/* Now we can do what we came for */
+			bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+
+			/*
+			 * Release any refcount we may have.
+			 *
+			 * This is very probably dead code, and if it isn't then it's
+			 * probably wrong.  I added the Assert to find out --- tgl
+			 * 11/99.
+			 */
+			if (!(bufHdr->flags & BM_FREE))
+			{
+				/* Assert checks that buffer will actually get freed! */
+				Assert(PrivateRefCount[i - 1] == 1 &&
+					   bufHdr->refcount == 1);
+				/* ReleaseBuffer expects we do not hold the lock at entry */
+				SpinRelease(BufMgrLock);
+				ReleaseBuffer(i);
+				SpinAcquire(BufMgrLock);
+			}
+			/*
+			 * And mark the buffer as no longer occupied by this rel.
+			 */
+			BufTableDelete(bufHdr);
+		}
+
+		/*
+		 * Also check to see if BufferDirtiedByMe info for this buffer
+		 * refers to the target relation, and clear it if so.  This is
+		 * independent of whether the current contents of the buffer
+		 * belong to the target relation!
+		 *
+		 * NOTE: we have no way to clear BufferDirtiedByMe info in other
+		 * backends, but hopefully there are none with that bit set for
+		 * this rel, since we hold exclusive lock on this rel.
+		 */
+		if (RelFileNodeEquals(rnode, 
+							  BufferTagLastDirtied[i - 1].rnode))
 			BufferDirtiedByMe[i - 1] = false;
 	}
 
@@ -1604,7 +1712,7 @@ recheck:
  *		bothering to write them out first.  This is used when we destroy a
  *		database, to avoid trying to flush data to disk when the directory
  *		tree no longer exists.	Implementation is pretty similar to
- *		ReleaseRelationBuffers() which is for destroying just one relation.
+ *		DropRelationBuffers() which is for destroying just one relation.
  * --------------------------------------------------------------------
  */
 void
@@ -1757,33 +1865,32 @@ BufferPoolBlowaway()
 /* ---------------------------------------------------------------------
  *		FlushRelationBuffers
  *
- *		This function flushes all dirty pages of a relation out to disk.
+ *		This function writes all dirty pages of a relation out to disk.
  *		Furthermore, pages that have blocknumber >= firstDelBlock are
  *		actually removed from the buffer pool.  An error code is returned
  *		if we fail to dump a dirty buffer or if we find one of
  *		the target pages is pinned into the cache.
  *
- *		This is used by VACUUM before truncating the relation to the given
- *		number of blocks.  (TRUNCATE TABLE also uses it in the same way.)
- *		It might seem unnecessary to flush dirty pages before firstDelBlock,
- *		since VACUUM should already have committed its changes.  However,
- *		it is possible for there still to be dirty pages: if some page
- *		had unwritten on-row tuple status updates from a prior transaction,
- *		and VACUUM had no additional changes to make to that page, then
- *		VACUUM won't have written it.  This is harmless in most cases but
- *		will break pg_upgrade, which relies on VACUUM to ensure that *all*
- *		tuples have correct on-row status.  So, we check and flush all
- *		dirty pages of the rel regardless of block number.
- *
- *		This is also used by RENAME TABLE (with firstDelBlock = 0)
- *		to clear out the buffer cache before renaming the physical files of
- *		a relation.  Without that, some other backend might try to do a
- *		blind write of a buffer page (relying on the BlindId of the buffer)
- *		and fail because it's not got the right filename anymore.
+ *		This is called by DROP TABLE to clear buffers for the relation
+ *		from the buffer pool.  Note that we must write dirty buffers,
+ *		rather than just dropping the changes, because our transaction
+ *		might abort later on; we want to roll back safely in that case.
+ *
+ *		This is also called by VACUUM before truncating the relation to the
+ *		given number of blocks.  It might seem unnecessary for VACUUM to
+ *		write dirty pages before firstDelBlock, since VACUUM should already
+ *		have committed its changes.  However, it is possible for there still
+ *		to be dirty pages: if some page had unwritten on-row tuple status
+ *		updates from a prior transaction, and VACUUM had no additional
+ *		changes to make to that page, then VACUUM won't have written it.
+ *		This is harmless in most cases but will break pg_upgrade, which
+ *		relies on VACUUM to ensure that *all* tuples have correct on-row
+ *		status.  So, we check and write all dirty pages of the rel
+ *		regardless of block number.
  *
  *		In all cases, the caller should be holding AccessExclusiveLock on
  *		the target relation to ensure that no other backend is busy reading
- *		more blocks of the relation.
+ *		more blocks of the relation (or might do so before we commit).
  *
  *		Formerly, we considered it an error condition if we found dirty
  *		buffers here.	However, since BufferSync no longer forces out all
@@ -1812,7 +1919,7 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
 			{
 				if (bufHdr->flags & BM_DIRTY)
 				{
-					if (FlushBuffer(-i - 1, false) != STATUS_OK)
+					if (FlushBuffer(-i - 1, false, false) != STATUS_OK)
 					{
 						elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it",
 							 RelationGetRelationName(rel), firstDelBlock,
@@ -1840,15 +1947,17 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
 	for (i = 0; i < NBuffers; i++)
 	{
 		bufHdr = &BufferDescriptors[i];
-recheck:
 		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
 		{
 			if (bufHdr->flags & BM_DIRTY)
 			{
 				PinBuffer(bufHdr);
 				SpinRelease(BufMgrLock);
-				if (FlushBuffer(i + 1, true) != STATUS_OK)
+				if (FlushBuffer(i + 1, false, false) != STATUS_OK)
 				{
+					SpinAcquire(BufMgrLock);
+					UnpinBuffer(bufHdr);
+					SpinRelease(BufMgrLock);
 					elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it",
 						 RelationGetRelationName(rel), firstDelBlock,
 						 bufHdr->tag.blockNum,
@@ -1856,12 +1965,7 @@ recheck:
 					return -1;
 				}
 				SpinAcquire(BufMgrLock);
-
-				/*
-				 * Buffer could already be reassigned, so must recheck
-				 * whether it still belongs to rel before freeing it!
-				 */
-				goto recheck;
+				UnpinBuffer(bufHdr);
 			}
 			if (!(bufHdr->flags & BM_FREE))
 			{