diff options
Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 200 |
1 files changed, 152 insertions, 48 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 9c9bda5035c..8d40e8d952f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.92 2000/10/28 16:20:55 vadim Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.93 2000/11/08 22:09:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -709,23 +709,28 @@ refcount = %ld, file: %s, line: %d\n", #endif /* - * FlushBuffer -- like WriteBuffer, but force the page to disk. + * FlushBuffer -- like WriteBuffer, but write the page immediately, + * rather than just marking it dirty. On success return, the buffer will + * no longer be dirty. * * 'buffer' is known to be dirty/pinned, so there should not be a * problem reading the BufferDesc members without the BufMgrLock * (nobody should be able to change tags out from under us). * - * Unpin if 'release' is TRUE. + * If 'sync' is true, a synchronous write is wanted (wait for buffer to hit + * the disk). Otherwise it's sufficient to issue the kernel write call. + * + * Unpin buffer if 'release' is true. */ int -FlushBuffer(Buffer buffer, bool release) +FlushBuffer(Buffer buffer, bool sync, bool release) { BufferDesc *bufHdr; Relation bufrel; int status; if (BufferIsLocal(buffer)) - return FlushLocalBuffer(buffer, release) ? STATUS_OK : STATUS_ERROR; + return FlushLocalBuffer(buffer, sync, release) ? STATUS_OK : STATUS_ERROR; if (BAD_BUFFER_ID(buffer)) return STATUS_ERROR; @@ -755,12 +760,16 @@ FlushBuffer(Buffer buffer, bool release) */ LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_SHARE); - status = smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); + if (sync) + status = smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + else + status = smgrwrite(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_UNLOCK); - /* drop relcache refcnt incremented by RelationIdCacheGetRelation */ + /* drop relcache refcnt incremented by RelationNodeCacheGetRelation */ RelationDecrementReferenceCount(bufrel); if (status == SM_FAIL) @@ -926,7 +935,7 @@ SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr) /* * drop relcache refcnt incremented by - * RelationIdCacheGetRelation + * RelationNodeCacheGetRelation */ RelationDecrementReferenceCount(reln); } @@ -1123,7 +1132,7 @@ BufferSync() bufHdr->flags &= ~BM_DIRTY; } - /* drop refcnt obtained by RelationIdCacheGetRelation */ + /* drop refcnt obtained by RelationNodeCacheGetRelation */ if (reln != (Relation) NULL) RelationDecrementReferenceCount(reln); } @@ -1154,7 +1163,7 @@ BufferSync() /* * drop relcache refcnt incremented by - * RelationIdCacheGetRelation + * RelationNodeCacheGetRelation */ RelationDecrementReferenceCount(reln); @@ -1458,7 +1467,7 @@ BufferReplace(BufferDesc *bufHdr) SpinAcquire(BufMgrLock); - /* drop relcache refcnt incremented by RelationIdCacheGetRelation */ + /* drop relcache refcnt incremented by RelationNodeCacheGetRelation */ if (reln != (Relation) NULL) RelationDecrementReferenceCount(reln); @@ -1495,21 +1504,23 @@ RelationGetNumberOfBlocks(Relation relation) } /* --------------------------------------------------------------------- - * ReleaseRelationBuffers + * DropRelationBuffers * * This function removes all the buffered pages for a relation * from the buffer pool. Dirty pages are simply dropped, without - * bothering to write them out first. This is used when the - * relation is about to be deleted. We assume that the caller - * holds an exclusive lock on the relation, which should assure - * that no new buffers will be acquired for the rel meanwhile. + * bothering to write them out first. This is NOT rollback-able, + * and so should be used only with extreme caution! + * + * We assume that the caller holds an exclusive lock on the relation, + * which should assure that no new buffers will be acquired for the rel + * meanwhile. * * XXX currently it sequentially searches the buffer pool, should be * changed to more clever ways of searching. * -------------------------------------------------------------------- */ void -ReleaseRelationBuffers(Relation rel) +DropRelationBuffers(Relation rel) { int i; BufferDesc *bufHdr; @@ -1589,7 +1600,104 @@ recheck: * this rel, since we hold exclusive lock on this rel. */ if (RelFileNodeEquals(rel->rd_node, - BufferTagLastDirtied[i - 1].rnode)) + BufferTagLastDirtied[i - 1].rnode)) + BufferDirtiedByMe[i - 1] = false; + } + + SpinRelease(BufMgrLock); +} + +/* --------------------------------------------------------------------- + * DropRelFileNodeBuffers + * + * This is the same as DropRelationBuffers, except that the target + * relation is specified by RelFileNode. + * + * This is NOT rollback-able. One legitimate use is to clear the + * buffer cache of buffers for a relation that is being deleted + * during transaction abort. + * -------------------------------------------------------------------- + */ +void +DropRelFileNodeBuffers(RelFileNode rnode) +{ + int i; + BufferDesc *bufHdr; + + /* We have to search both local and shared buffers... */ + + for (i = 0; i < NLocBuffer; i++) + { + bufHdr = &LocalBufferDescriptors[i]; + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) + { + bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + LocalRefCount[i] = 0; + bufHdr->tag.rnode.relNode = InvalidOid; + } + } + + SpinAcquire(BufMgrLock); + for (i = 1; i <= NBuffers; i++) + { + bufHdr = &BufferDescriptors[i - 1]; +recheck: + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) + { + + /* + * If there is I/O in progress, better wait till it's done; + * don't want to delete the relation out from under someone + * who's just trying to flush the buffer! + */ + if (bufHdr->flags & BM_IO_IN_PROGRESS) + { + WaitIO(bufHdr, BufMgrLock); + + /* + * By now, the buffer very possibly belongs to some other + * rel, so check again before proceeding. + */ + goto recheck; + } + /* Now we can do what we came for */ + bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + + /* + * Release any refcount we may have. + * + * This is very probably dead code, and if it isn't then it's + * probably wrong. I added the Assert to find out --- tgl + * 11/99. + */ + if (!(bufHdr->flags & BM_FREE)) + { + /* Assert checks that buffer will actually get freed! */ + Assert(PrivateRefCount[i - 1] == 1 && + bufHdr->refcount == 1); + /* ReleaseBuffer expects we do not hold the lock at entry */ + SpinRelease(BufMgrLock); + ReleaseBuffer(i); + SpinAcquire(BufMgrLock); + } + /* + * And mark the buffer as no longer occupied by this rel. + */ + BufTableDelete(bufHdr); + } + + /* + * Also check to see if BufferDirtiedByMe info for this buffer + * refers to the target relation, and clear it if so. This is + * independent of whether the current contents of the buffer + * belong to the target relation! + * + * NOTE: we have no way to clear BufferDirtiedByMe info in other + * backends, but hopefully there are none with that bit set for + * this rel, since we hold exclusive lock on this rel. + */ + if (RelFileNodeEquals(rnode, + BufferTagLastDirtied[i - 1].rnode)) BufferDirtiedByMe[i - 1] = false; } @@ -1604,7 +1712,7 @@ recheck: * bothering to write them out first. This is used when we destroy a * database, to avoid trying to flush data to disk when the directory * tree no longer exists. Implementation is pretty similar to - * ReleaseRelationBuffers() which is for destroying just one relation. + * DropRelationBuffers() which is for destroying just one relation. * -------------------------------------------------------------------- */ void @@ -1757,33 +1865,32 @@ BufferPoolBlowaway() /* --------------------------------------------------------------------- * FlushRelationBuffers * - * This function flushes all dirty pages of a relation out to disk. + * This function writes all dirty pages of a relation out to disk. * Furthermore, pages that have blocknumber >= firstDelBlock are * actually removed from the buffer pool. An error code is returned * if we fail to dump a dirty buffer or if we find one of * the target pages is pinned into the cache. * - * This is used by VACUUM before truncating the relation to the given - * number of blocks. (TRUNCATE TABLE also uses it in the same way.) - * It might seem unnecessary to flush dirty pages before firstDelBlock, - * since VACUUM should already have committed its changes. However, - * it is possible for there still to be dirty pages: if some page - * had unwritten on-row tuple status updates from a prior transaction, - * and VACUUM had no additional changes to make to that page, then - * VACUUM won't have written it. This is harmless in most cases but - * will break pg_upgrade, which relies on VACUUM to ensure that *all* - * tuples have correct on-row status. So, we check and flush all - * dirty pages of the rel regardless of block number. - * - * This is also used by RENAME TABLE (with firstDelBlock = 0) - * to clear out the buffer cache before renaming the physical files of - * a relation. Without that, some other backend might try to do a - * blind write of a buffer page (relying on the BlindId of the buffer) - * and fail because it's not got the right filename anymore. + * This is called by DROP TABLE to clear buffers for the relation + * from the buffer pool. Note that we must write dirty buffers, + * rather than just dropping the changes, because our transaction + * might abort later on; we want to roll back safely in that case. + * + * This is also called by VACUUM before truncating the relation to the + * given number of blocks. It might seem unnecessary for VACUUM to + * write dirty pages before firstDelBlock, since VACUUM should already + * have committed its changes. However, it is possible for there still + * to be dirty pages: if some page had unwritten on-row tuple status + * updates from a prior transaction, and VACUUM had no additional + * changes to make to that page, then VACUUM won't have written it. + * This is harmless in most cases but will break pg_upgrade, which + * relies on VACUUM to ensure that *all* tuples have correct on-row + * status. So, we check and flush all dirty pages of the rel + * regardless of block number. * * In all cases, the caller should be holding AccessExclusiveLock on * the target relation to ensure that no other backend is busy reading - * more blocks of the relation. + * more blocks of the relation (or might do so before we commit). * * Formerly, we considered it an error condition if we found dirty * buffers here. However, since BufferSync no longer forces out all @@ -1812,7 +1919,7 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) { if (bufHdr->flags & BM_DIRTY) { - if (FlushBuffer(-i - 1, false) != STATUS_OK) + if (FlushBuffer(-i - 1, false, false) != STATUS_OK) { elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it", RelationGetRelationName(rel), firstDelBlock, @@ -1840,15 +1947,17 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) for (i = 0; i < NBuffers; i++) { bufHdr = &BufferDescriptors[i]; -recheck: if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) { if (bufHdr->flags & BM_DIRTY) { PinBuffer(bufHdr); SpinRelease(BufMgrLock); - if (FlushBuffer(i + 1, true) != STATUS_OK) + if (FlushBuffer(i + 1, false, false) != STATUS_OK) { + SpinAcquire(BufMgrLock); + UnpinBuffer(bufHdr); + SpinRelease(BufMgrLock); elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it", RelationGetRelationName(rel), firstDelBlock, bufHdr->tag.blockNum, @@ -1856,12 +1965,7 @@ recheck: return -1; } SpinAcquire(BufMgrLock); - - /* - * Buffer could already be reassigned, so must recheck - * whether it still belongs to rel before freeing it! - */ - goto recheck; + UnpinBuffer(bufHdr); } if (!(bufHdr->flags & BM_FREE)) { |