diff options
author | Vadim B. Mikheev <vadim4o@yahoo.com> | 2000-11-30 08:46:26 +0000 |
---|---|---|
committer | Vadim B. Mikheev <vadim4o@yahoo.com> | 2000-11-30 08:46:26 +0000 |
commit | 81c8c244b26011a071c89b43a38bba7039226019 (patch) | |
tree | a0602e39901d870d1fe4275a96c70a8450710882 /src/backend/storage/buffer | |
parent | b16516b887f058782d67c90103148544f8adbd8f (diff) | |
download | postgresql-81c8c244b26011a071c89b43a38bba7039226019.tar.gz postgresql-81c8c244b26011a071c89b43a38bba7039226019.zip |
No more #ifdef XLOG.
Diffstat (limited to 'src/backend/storage/buffer')
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 731 | ||||
-rw-r--r-- | src/backend/storage/buffer/xlog_bufmgr.c | 2202 | ||||
-rw-r--r-- | src/backend/storage/buffer/xlog_localbuf.c | 284 |
3 files changed, 199 insertions, 3018 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index a25d4d9a55b..9400da38058 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * - * bufmgr.c + * xlog_bufmgr.c * buffer manager interface routines * * Portions Copyright (c) 1996-2000, PostgreSQL, Inc @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.96 2000/11/30 01:39:07 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.97 2000/11/30 08:46:23 vadim Exp $ * *------------------------------------------------------------------------- */ @@ -31,9 +31,6 @@ * * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer() * - * FlushBuffer() -- Write buffer immediately. Can unpin, or not, - * depending on parameter. - * * BufferSync() -- flush all dirty buffers in the buffer pool. * * InitBufferPool() -- Init the buffer module. @@ -42,13 +39,8 @@ * freelist.c -- chooses victim for buffer replacement * buf_table.c -- manages the buffer lookup table */ - #include "postgres.h" -#ifdef XLOG -#include "xlog_bufmgr.c" -#else - #include <sys/types.h> #include <sys/file.h> #include <math.h> @@ -61,10 +53,11 @@ #include "storage/s_lock.h" #include "storage/smgr.h" #include "utils/relcache.h" - -#ifdef XLOG #include "catalog/pg_database.h" -#endif + +#define BufferGetLSN(bufHdr) \ + (*((XLogRecPtr*)MAKE_PTR((bufHdr)->data))) + extern SPINLOCK BufMgrLock; extern long int ReadBufferCount; @@ -99,9 +92,6 @@ static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum, bool bufferLockHeld); static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr, bool bufferLockHeld); -static void SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr); -static void ClearBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr); -static void BufferSync(void); static int BufferReplace(BufferDesc *bufHdr); void PrintBufferDescs(void); @@ -170,48 +160,6 @@ ReadBuffer(Relation reln, BlockNumber blockNum) } /* - * is_userbuffer - * - * XXX caller must have already acquired BufMgrLock - */ -#ifdef NOT_USED -static bool -is_userbuffer(Buffer buffer) -{ - BufferDesc *buf = &BufferDescriptors[buffer - 1]; - - if (IsSystemRelationName(buf->blind.relname)) - return false; - return true; -} - -#endif - -#ifdef NOT_USED -Buffer -ReadBuffer_Debug(char *file, - int line, - Relation reln, - BlockNumber blockNum) -{ - Buffer buffer; - - buffer = ReadBufferWithBufferLock(reln, blockNum, false); - if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) - { - BufferDesc *buf = &BufferDescriptors[buffer - 1]; - - fprintf(stderr, "PIN(RD) %ld relname = %s, blockNum = %d, \ -refcount = %ld, file: %s, line: %d\n", - buffer, buf->blind.relname, buf->tag.blockNum, - PrivateRefCount[buffer - 1], file, line); - } - return buffer; -} - -#endif - -/* * ReadBufferWithBufferLock -- does the work of * ReadBuffer() but with the possibility that * the buffer lock has already been held. this @@ -447,7 +395,7 @@ BufferAlloc(Relation reln, buf->refcount = 1; PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1; - if (buf->flags & BM_DIRTY) + if (buf->flags & BM_DIRTY || buf->cntxDirty) { bool smok; @@ -505,18 +453,18 @@ BufferAlloc(Relation reln, } else { - /* * BM_JUST_DIRTIED cleared by BufferReplace and shouldn't * be setted by anyone. - vadim 01/17/97 */ if (buf->flags & BM_JUST_DIRTIED) { - elog(FATAL, "BufferAlloc: content of block %u (%s) changed while flushing", + elog(STOP, "BufferAlloc: content of block %u (%s) changed while flushing", buf->tag.blockNum, buf->blind.relname); } else buf->flags &= ~BM_DIRTY; + buf->cntxDirty = false; } /* @@ -676,131 +624,15 @@ WriteBuffer(Buffer buffer) SpinAcquire(BufMgrLock); Assert(bufHdr->refcount > 0); + bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - SetBufferDirtiedByMe(buffer, bufHdr); + UnpinBuffer(bufHdr); SpinRelease(BufMgrLock); return TRUE; } -#ifdef NOT_USED -void -WriteBuffer_Debug(char *file, int line, Buffer buffer) -{ - WriteBuffer(buffer); - if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) - { - BufferDesc *buf; - - buf = &BufferDescriptors[buffer - 1]; - fprintf(stderr, "UNPIN(WR) %ld relname = %s, blockNum = %d, \ -refcount = %ld, file: %s, line: %d\n", - buffer, buf->blind.relname, buf->tag.blockNum, - PrivateRefCount[buffer - 1], file, line); - } -} - -#endif - -/* - * FlushBuffer -- like WriteBuffer, but write the page immediately, - * rather than just marking it dirty. On success return, the buffer will - * no longer be dirty. - * - * 'buffer' is known to be dirty/pinned, so there should not be a - * problem reading the BufferDesc members without the BufMgrLock - * (nobody should be able to change tags out from under us). - * - * If 'sync' is true, a synchronous write is wanted (wait for buffer to hit - * the disk). Otherwise it's sufficient to issue the kernel write call. - * - * Unpin buffer if 'release' is true. - */ -int -FlushBuffer(Buffer buffer, bool sync, bool release) -{ - BufferDesc *bufHdr; - Relation bufrel; - int status; - - if (BufferIsLocal(buffer)) - return FlushLocalBuffer(buffer, sync, release) ? STATUS_OK : STATUS_ERROR; - - if (BAD_BUFFER_ID(buffer)) - return STATUS_ERROR; - - Assert(PrivateRefCount[buffer - 1] > 0); /* else caller didn't pin */ - - bufHdr = &BufferDescriptors[buffer - 1]; - - bufrel = RelationNodeCacheGetRelation(bufHdr->tag.rnode); - - Assert(bufrel != (Relation) NULL); - - SharedBufferChanged = true; - - /* To check if block content changed while flushing. - vadim 01/17/97 */ - SpinAcquire(BufMgrLock); - WaitIO(bufHdr, BufMgrLock); /* confirm end of IO */ - bufHdr->flags &= ~BM_JUST_DIRTIED; - StartBufferIO(bufHdr, false); /* output IO start */ - - SpinRelease(BufMgrLock); - - /* - * Grab a read lock on the buffer to ensure that no - * other backend changes its contents while we write it; - * see comments in BufferSync(). - */ - LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_SHARE); - - if (sync) - status = smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - else - status = smgrwrite(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - - LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_UNLOCK); - - /* drop relcache refcnt incremented by RelationNodeCacheGetRelation */ - RelationDecrementReferenceCount(bufrel); - - if (status == SM_FAIL) - { - elog(ERROR, "FlushBuffer: cannot flush block %u of the relation %s", - bufHdr->tag.blockNum, bufHdr->blind.relname); - return STATUS_ERROR; - } - BufferFlushCount++; - - SpinAcquire(BufMgrLock); - bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ - TerminateBufferIO(bufHdr); /* output IO finished */ - - /* - * If this buffer was marked by someone as DIRTY while we were - * flushing it out we must not clear shared DIRTY flag - vadim - * 01/17/97 - * - * ... but we can clear BufferDirtiedByMe anyway - tgl 3/31/00 - */ - if (bufHdr->flags & BM_JUST_DIRTIED) - { - elog(NOTICE, "FlushBuffer: content of block %u (%s) changed while flushing", - bufHdr->tag.blockNum, bufHdr->blind.relname); - } - else - bufHdr->flags &= ~BM_DIRTY; - ClearBufferDirtiedByMe(buffer, bufHdr); - if (release) - UnpinBuffer(bufHdr); - SpinRelease(BufMgrLock); - - return STATUS_OK; -} - /* * WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer * when the operation is complete. @@ -822,8 +654,9 @@ WriteNoReleaseBuffer(Buffer buffer) SpinAcquire(BufMgrLock); Assert(bufHdr->refcount > 0); + bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - SetBufferDirtiedByMe(buffer, bufHdr); + SpinRelease(BufMgrLock); return STATUS_OK; @@ -876,307 +709,138 @@ ReleaseAndReadBuffer(Buffer buffer, } /* - * SetBufferDirtiedByMe -- mark a shared buffer as being dirtied by this xact - * - * This flag essentially remembers that we need to write and fsync this buffer - * before we can commit the transaction. The write might end up getting done - * by another backend, but we must do the fsync ourselves (else we could - * commit before the data actually reaches disk). We do not issue fsync - * instantly upon write; the storage manager keeps track of which files need - * to be fsync'd before commit can occur. A key aspect of this data structure - * is that we will be able to notify the storage manager that an fsync is - * needed even after another backend has done the physical write and replaced - * the buffer contents with something else! + * BufferSync -- Write all dirty buffers in the pool. * - * NB: we must be holding the bufmgr lock at entry, and the buffer must be - * pinned so that no other backend can take it away from us. + * This is called at checkpoint time and write out all dirty buffers. */ -static void -SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr) -{ - BufferTag *tagLastDirtied = &BufferTagLastDirtied[buffer - 1]; - Relation reln; - int status; - - /* - * If the flag is already set, check to see whether the buffertag is - * the same. If not, some other backend already wrote the buffer data - * that we dirtied. We must tell the storage manager to make an fsync - * pending on that file before we can overwrite the old tag value. - */ - if (BufferDirtiedByMe[buffer - 1]) - { - if (RelFileNodeEquals(bufHdr->tag.rnode, tagLastDirtied->rnode) && - bufHdr->tag.blockNum == tagLastDirtied->blockNum) - return; /* Same tag already dirtied, so no work */ - -#ifndef OPTIMIZE_SINGLE - SpinRelease(BufMgrLock); -#endif /* OPTIMIZE_SINGLE */ - - reln = RelationNodeCacheGetRelation(tagLastDirtied->rnode); - - if (reln == (Relation) NULL) - { - status = smgrblindmarkdirty(DEFAULT_SMGR, - tagLastDirtied->rnode, - tagLastDirtied->blockNum); - } - else - { - Assert(RelFileNodeEquals(tagLastDirtied->rnode, reln->rd_node)); - status = smgrmarkdirty(DEFAULT_SMGR, reln, - tagLastDirtied->blockNum); - - /* - * drop relcache refcnt incremented by - * RelationNodeCacheGetRelation - */ - RelationDecrementReferenceCount(reln); - } - if (status == SM_FAIL) - { - elog(ERROR, "SetBufferDirtiedByMe: cannot mark %u for %s", - tagLastDirtied->blockNum, - BufferBlindLastDirtied[buffer - 1].relname); - } - -#ifndef OPTIMIZE_SINGLE - SpinAcquire(BufMgrLock); -#endif /* OPTIMIZE_SINGLE */ - - } - - *tagLastDirtied = bufHdr->tag; - BufferBlindLastDirtied[buffer - 1] = bufHdr->blind; - BufferDirtiedByMe[buffer - 1] = true; -} - -/* - * ClearBufferDirtiedByMe -- mark a shared buffer as no longer needing fsync - * - * If we write out a buffer ourselves, then the storage manager will set its - * needs-fsync flag for that file automatically, and so we can clear our own - * flag that says it needs to be done later. - * - * NB: we must be holding the bufmgr lock at entry. - */ -static void -ClearBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr) -{ - BufferTag *tagLastDirtied = &BufferTagLastDirtied[buffer - 1]; - - /* - * Do *not* clear the flag if it refers to some other buffertag than - * the data we just wrote. This is unlikely, but possible if some - * other backend replaced the buffer contents since we set our flag. - */ - if (RelFileNodeEquals(bufHdr->tag.rnode, tagLastDirtied->rnode) && - bufHdr->tag.blockNum == tagLastDirtied->blockNum) - BufferDirtiedByMe[buffer - 1] = false; -} - -/* - * BufferSync -- Flush all dirty buffers in the pool. - * - * This is called at transaction commit time. We find all buffers - * that have been dirtied by the current xact and flush them to disk. - * We do *not* flush dirty buffers that have been dirtied by other xacts. - * (This is a substantial change from pre-7.0 behavior.) - */ -static void +void BufferSync() { int i; BufferDesc *bufHdr; + Buffer buffer; int status; - Relation reln; - bool didwrite; + RelFileNode rnode; + XLogRecPtr recptr; + Relation reln = NULL; for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) { - /* Ignore buffers that were not dirtied by me */ - if (!BufferDirtiedByMe[i]) - continue; SpinAcquire(BufMgrLock); - /* - * We only need to write if the buffer is still dirty and still - * contains the same disk page that it contained when we dirtied - * it. Otherwise, someone else has already written our changes for - * us, and we need only fsync. - * - * (NOTE: it's still possible to do an unnecessary write, if other - * xacts have written and then re-dirtied the page since our last - * change to it. But that should be pretty uncommon, and there's - * no easy way to detect it anyway.) - */ - reln = NULL; - didwrite = false; - if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY)) + if (!(bufHdr->flags & BM_VALID)) { - if (RelFileNodeEquals(bufHdr->tag.rnode, BufferTagLastDirtied[i].rnode) && - bufHdr->tag.blockNum == BufferTagLastDirtied[i].blockNum) - { - /* - * Try to find relation for buf. This could fail, if the - * rel has been flushed from the relcache since we dirtied - * the page. That should be uncommon, so paying the extra - * cost of a blind write when it happens seems OK. - */ - if (!InRecovery) - reln = RelationNodeCacheGetRelation(bufHdr->tag.rnode); - - /* - * We have to pin buffer to keep anyone from stealing it - * from the buffer pool while we are flushing it or - * waiting in WaitIO. It's bad for GetFreeBuffer in - * BufferAlloc, but there is no other way to prevent - * writing into disk block data from some other buffer, - * getting smgr status of some other block and clearing - * BM_DIRTY of ... - VAdim 09/16/96 - */ - PinBuffer(bufHdr); - if (bufHdr->flags & BM_IO_IN_PROGRESS) - { - WaitIO(bufHdr, BufMgrLock); - UnpinBuffer(bufHdr); - if (bufHdr->flags & BM_IO_ERROR) - { - elog(ERROR, "BufferSync: write error %u for %s", - bufHdr->tag.blockNum, bufHdr->blind.relname); - } - } - else - { - - /* - * To check if block content changed while flushing - * (see below). - vadim 01/17/97 - */ - WaitIO(bufHdr, BufMgrLock); /* confirm end of IO */ - bufHdr->flags &= ~BM_JUST_DIRTIED; - StartBufferIO(bufHdr, false); /* output IO start */ - - SpinRelease(BufMgrLock); - - /* - * Grab a read lock on the buffer to ensure that no - * other backend changes its contents while we write it; - * otherwise we could write a non-self-consistent page - * image to disk, which'd be bad news if the other - * transaction aborts before writing its changes. - * - * Note that we still need the BM_JUST_DIRTIED mechanism - * in case someone dirties the buffer just before we - * grab this lock or just after we release it. - */ - LockBuffer(BufferDescriptorGetBuffer(bufHdr), - BUFFER_LOCK_SHARE); + SpinRelease(BufMgrLock); + continue; + } - /* - * If we didn't have the reldesc in our local cache, - * write this page out using the 'blind write' storage - * manager routine. If we did find it, use the - * standard interface. - */ - if (reln == (Relation) NULL) - { - status = smgrblindwrt(DEFAULT_SMGR, - bufHdr->tag.rnode, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data), - true); /* must fsync */ - } - else - { - status = smgrwrite(DEFAULT_SMGR, reln, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - } + /* + * Pin buffer and ensure that no one reads it from disk + */ + PinBuffer(bufHdr); + /* Synchronize with BufferAlloc */ + if (bufHdr->flags & BM_IO_IN_PROGRESS) + WaitIO(bufHdr, BufMgrLock); - /* - * Release the per-buffer readlock, reacquire BufMgrLock. - */ - LockBuffer(BufferDescriptorGetBuffer(bufHdr), - BUFFER_LOCK_UNLOCK); + buffer = BufferDescriptorGetBuffer(bufHdr); + rnode = bufHdr->tag.rnode; - SpinAcquire(BufMgrLock); + SpinRelease(BufMgrLock); - UnpinBuffer(bufHdr); - if (status == SM_FAIL) - { - bufHdr->flags |= BM_IO_ERROR; - elog(ERROR, "BufferSync: cannot write %u for %s", - bufHdr->tag.blockNum, bufHdr->blind.relname); - } - bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ - TerminateBufferIO(bufHdr); /* Sync IO finished */ - BufferFlushCount++; - didwrite = true; + /* + * Try to find relation for buffer + */ + reln = RelationNodeCacheGetRelation(rnode); - /* - * If this buffer was marked by someone as DIRTY while - * we were flushing it out we must not clear DIRTY - * flag - vadim 01/17/97 - * - * but it is OK to clear BufferDirtiedByMe - tgl 3/31/00 - */ - if (!(bufHdr->flags & BM_JUST_DIRTIED)) - bufHdr->flags &= ~BM_DIRTY; - } + /* + * Protect buffer content against concurrent update + */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); - /* drop refcnt obtained by RelationNodeCacheGetRelation */ - if (reln != (Relation) NULL) - RelationDecrementReferenceCount(reln); - } - } + /* + * Force XLOG flush for buffer' LSN + */ + recptr = BufferGetLSN(bufHdr); + XLogFlush(recptr); /* - * If we did not write the buffer (because someone else did), we - * must still fsync the file containing it, to ensure that the - * write is down to disk before we commit. + * Now it's safe to write buffer to disk + * (if needed at all -:)) */ - if (!didwrite) + + SpinAcquire(BufMgrLock); + if (bufHdr->flags & BM_IO_IN_PROGRESS) + WaitIO(bufHdr, BufMgrLock); + + if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) { -#ifndef OPTIMIZE_SINGLE + bufHdr->flags &= ~BM_JUST_DIRTIED; + StartBufferIO(bufHdr, false); /* output IO start */ + SpinRelease(BufMgrLock); -#endif /* OPTIMIZE_SINGLE */ - reln = RelationNodeCacheGetRelation(BufferTagLastDirtied[i].rnode); if (reln == (Relation) NULL) { - status = smgrblindmarkdirty(DEFAULT_SMGR, - BufferTagLastDirtied[i].rnode, - BufferTagLastDirtied[i].blockNum); + status = smgrblindwrt(DEFAULT_SMGR, + bufHdr->tag.rnode, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data), + true); /* must fsync */ } else { - status = smgrmarkdirty(DEFAULT_SMGR, reln, - BufferTagLastDirtied[i].blockNum); + status = smgrwrite(DEFAULT_SMGR, reln, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } - /* - * drop relcache refcnt incremented by - * RelationNodeCacheGetRelation - */ - RelationDecrementReferenceCount(reln); + if (status == SM_FAIL) /* disk failure ?! */ + elog(STOP, "BufferSync: cannot write %u for %s", + bufHdr->tag.blockNum, bufHdr->blind.relname); + + /* + * Note that it's safe to change cntxDirty here because of + * we protect it from upper writers by share lock and from + * other bufmgr routines by BM_IO_IN_PROGRESS + */ + bufHdr->cntxDirty = false; + + /* + * Release the per-buffer readlock, reacquire BufMgrLock. + */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + BufferFlushCount++; - } -#ifndef OPTIMIZE_SINGLE SpinAcquire(BufMgrLock); -#endif /* OPTIMIZE_SINGLE */ + + bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ + TerminateBufferIO(bufHdr); /* Sync IO finished */ + + /* + * If this buffer was marked by someone as DIRTY while + * we were flushing it out we must not clear DIRTY + * flag - vadim 01/17/97 + */ + if (!(bufHdr->flags & BM_JUST_DIRTIED)) + bufHdr->flags &= ~BM_DIRTY; } + else + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - BufferDirtiedByMe[i] = false; + UnpinBuffer(bufHdr); SpinRelease(BufMgrLock); + + /* drop refcnt obtained by RelationNodeCacheGetRelation */ + if (reln != (Relation) NULL) + { + RelationDecrementReferenceCount(reln); + reln = NULL; + } } -#ifndef XLOG - LocalBufferSync(); -#endif -} +} /* * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared. @@ -1278,9 +942,6 @@ ResetBufferPool(bool isCommit) SpinRelease(BufMgrLock); } PrivateRefCount[i] = 0; - - if (!isCommit) - BufferDirtiedByMe[i] = false; } ResetLocalBufferPool(); @@ -1321,16 +982,29 @@ relname=%s, blockNum=%d, flags=0x%x, refcount=%d %ld)", } /* ------------------------------------------------ - * FlushBufferPool - * - * flush all dirty blocks in buffer pool to disk + * FlushBufferPool * + * Flush all dirty blocks in buffer pool to disk + * at the checkpoint time * ------------------------------------------------ */ void FlushBufferPool(void) { BufferSync(); + smgrsync(); +} + +/* + * At the commit time we have to flush local buffer pool only + */ +void +BufmgrCommit(void) +{ + LocalBufferSync(); + /* + * All files created in current transaction will be fsync-ed + */ smgrcommit(); } @@ -1358,35 +1032,28 @@ BufferGetBlockNumber(Buffer buffer) * * Write out the buffer corresponding to 'bufHdr' * - * This routine used to flush the data to disk (ie, force immediate fsync) - * but that's no longer necessary because BufferSync is smarter than before. - * * BufMgrLock must be held at entry, and the buffer must be pinned. */ static int BufferReplace(BufferDesc *bufHdr) { Relation reln; + XLogRecPtr recptr; int status; - /* - * first try to find the reldesc in the cache, if no luck, don't - * bother to build the reldesc from scratch, just do a blind write. - */ - - reln = RelationNodeCacheGetRelation(bufHdr->tag.rnode); - /* To check if block content changed while flushing. - vadim 01/17/97 */ bufHdr->flags &= ~BM_JUST_DIRTIED; SpinRelease(BufMgrLock); /* - * Grab a read lock on the buffer to ensure that no - * other backend changes its contents while we write it; - * see comments in BufferSync(). + * No need to lock buffer context - no one should be able to + * end ReadBuffer */ - LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_SHARE); + recptr = BufferGetLSN(bufHdr); + XLogFlush(recptr); + + reln = RelationNodeCacheGetRelation(bufHdr->tag.rnode); if (reln != (Relation) NULL) { @@ -1401,25 +1068,15 @@ BufferReplace(BufferDesc *bufHdr) false); /* no fsync */ } - LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_UNLOCK); - - SpinAcquire(BufMgrLock); - /* drop relcache refcnt incremented by RelationNodeCacheGetRelation */ if (reln != (Relation) NULL) RelationDecrementReferenceCount(reln); + SpinAcquire(BufMgrLock); + if (status == SM_FAIL) return FALSE; - /* - * If we had marked this buffer as needing to be fsync'd, we can - * forget about that, because it's now the storage manager's - * responsibility (but only if we called smgrwrite, not smgrblindwrt). - */ - if (reln != (Relation) NULL) - ClearBufferDirtiedByMe(BufferDescriptorGetBuffer(bufHdr), bufHdr); - BufferFlushCount++; return TRUE; @@ -1438,7 +1095,8 @@ BlockNumber RelationGetNumberOfBlocks(Relation relation) { return ((relation->rd_myxactonly) ? relation->rd_nblocks : - smgrnblocks(DEFAULT_SMGR, relation)); + ((relation->rd_rel->relkind == RELKIND_VIEW) ? 0 : + smgrnblocks(DEFAULT_SMGR, relation))); } /* --------------------------------------------------------------------- @@ -1471,6 +1129,7 @@ DropRelationBuffers(Relation rel) if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) { bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->cntxDirty = false; LocalRefCount[i] = 0; bufHdr->tag.rnode.relNode = InvalidOid; } @@ -1503,6 +1162,7 @@ recheck: } /* Now we can do what we came for */ bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->cntxDirty = false; /* * Release any refcount we may have. @@ -1526,20 +1186,6 @@ recheck: */ BufTableDelete(bufHdr); } - - /* - * Also check to see if BufferDirtiedByMe info for this buffer - * refers to the target relation, and clear it if so. This is - * independent of whether the current contents of the buffer - * belong to the target relation! - * - * NOTE: we have no way to clear BufferDirtiedByMe info in other - * backends, but hopefully there are none with that bit set for - * this rel, since we hold exclusive lock on this rel. - */ - if (RelFileNodeEquals(rel->rd_node, - BufferTagLastDirtied[i - 1].rnode)) - BufferDirtiedByMe[i - 1] = false; } SpinRelease(BufMgrLock); @@ -1570,6 +1216,7 @@ DropRelFileNodeBuffers(RelFileNode rnode) if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) { bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->cntxDirty = false; LocalRefCount[i] = 0; bufHdr->tag.rnode.relNode = InvalidOid; } @@ -1600,6 +1247,7 @@ recheck: } /* Now we can do what we came for */ bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->cntxDirty = false; /* * Release any refcount we may have. @@ -1623,20 +1271,6 @@ recheck: */ BufTableDelete(bufHdr); } - - /* - * Also check to see if BufferDirtiedByMe info for this buffer - * refers to the target relation, and clear it if so. This is - * independent of whether the current contents of the buffer - * belong to the target relation! - * - * NOTE: we have no way to clear BufferDirtiedByMe info in other - * backends, but hopefully there are none with that bit set for - * this rel, since we hold exclusive lock on this rel. - */ - if (RelFileNodeEquals(rnode, - BufferTagLastDirtied[i - 1].rnode)) - BufferDirtiedByMe[i - 1] = false; } SpinRelease(BufMgrLock); @@ -1689,6 +1323,7 @@ recheck: } /* Now we can do what we came for */ bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->cntxDirty = false; /* * The thing should be free, if caller has checked that no @@ -1700,17 +1335,6 @@ recheck: */ BufTableDelete(bufHdr); } - /* - * Also check to see if BufferDirtiedByMe info for this buffer - * refers to the target database, and clear it if so. This is - * independent of whether the current contents of the buffer - * belong to the target database! - * - * (Actually, this is probably unnecessary, since I shouldn't have - * ever dirtied pages of the target database, but...) - */ - if (BufferTagLastDirtied[i - 1].rnode.tblNode == dbid) - BufferDirtiedByMe[i - 1] = false; } SpinRelease(BufMgrLock); } @@ -1847,6 +1471,8 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) { int i; BufferDesc *bufHdr; + XLogRecPtr recptr; + int status; if (rel->rd_myxactonly) { @@ -1855,22 +1481,27 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) bufHdr = &LocalBufferDescriptors[i]; if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) { - if (bufHdr->flags & BM_DIRTY) + if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) { - if (FlushBuffer(-i - 1, false, false) != STATUS_OK) + status = smgrwrite(DEFAULT_SMGR, rel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + if (status == SM_FAIL) { elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it", RelationGetRelationName(rel), firstDelBlock, bufHdr->tag.blockNum); - return -1; + return(-1); } + bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->cntxDirty = false; } if (LocalRefCount[i] > 0) { elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is referenced (%ld)", RelationGetRelationName(rel), firstDelBlock, bufHdr->tag.blockNum, LocalRefCount[i]); - return -2; + return(-2); } if (bufHdr->tag.blockNum >= firstDelBlock) { @@ -1887,22 +1518,57 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) bufHdr = &BufferDescriptors[i]; if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) { - if (bufHdr->flags & BM_DIRTY) + if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) { PinBuffer(bufHdr); + if (bufHdr->flags & BM_IO_IN_PROGRESS) + WaitIO(bufHdr, BufMgrLock); SpinRelease(BufMgrLock); - if (FlushBuffer(i + 1, false, false) != STATUS_OK) + + /* + * Force XLOG flush for buffer' LSN + */ + recptr = BufferGetLSN(bufHdr); + XLogFlush(recptr); + + /* + * Now it's safe to write buffer to disk + */ + + SpinAcquire(BufMgrLock); + if (bufHdr->flags & BM_IO_IN_PROGRESS) + WaitIO(bufHdr, BufMgrLock); + + if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) { - SpinAcquire(BufMgrLock); - UnpinBuffer(bufHdr); + bufHdr->flags &= ~BM_JUST_DIRTIED; + StartBufferIO(bufHdr, false); /* output IO start */ + SpinRelease(BufMgrLock); - elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it", - RelationGetRelationName(rel), firstDelBlock, - bufHdr->tag.blockNum, - PrivateRefCount[i], bufHdr->refcount); - return -1; + + status = smgrwrite(DEFAULT_SMGR, rel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + + if (status == SM_FAIL) /* disk failure ?! */ + elog(STOP, "FlushRelationBuffers: cannot write %u for %s", + bufHdr->tag.blockNum, bufHdr->blind.relname); + + BufferFlushCount++; + + SpinAcquire(BufMgrLock); + bufHdr->flags &= ~BM_IO_IN_PROGRESS; + TerminateBufferIO(bufHdr); + Assert(!(bufHdr->flags & BM_JUST_DIRTIED)); + bufHdr->flags &= ~BM_DIRTY; + /* + * Note that it's safe to change cntxDirty here because + * of we protect it from upper writers by + * AccessExclusiveLock and from other bufmgr routines + * by BM_IO_IN_PROGRESS + */ + bufHdr->cntxDirty = false; } - SpinAcquire(BufMgrLock); UnpinBuffer(bufHdr); } if (!(bufHdr->flags & BM_FREE)) @@ -2341,6 +2007,9 @@ LockBuffer(Buffer buffer, int mode) } buf->w_lock = true; *buflock |= BL_W_LOCK; + + buf->cntxDirty = true; + if (*buflock & BL_RI_LOCK) { @@ -2458,11 +2127,11 @@ AbortBufferIO(void) Assert(buf->flags & BM_IO_IN_PROGRESS); SpinAcquire(BufMgrLock); if (IsForInput) - Assert(!(buf->flags & BM_DIRTY)); + Assert(!(buf->flags & BM_DIRTY) && !(buf->cntxDirty)); else { - Assert((buf->flags & BM_DIRTY) != 0); - if ((buf->flags & BM_IO_ERROR) != 0) + Assert(buf->flags & BM_DIRTY || buf->cntxDirty); + if (buf->flags & BM_IO_ERROR) { elog(NOTICE, "write error may be permanent: cannot write block %u for %s/%s", buf->tag.blockNum, buf->blind.dbname, buf->blind.relname); @@ -2528,5 +2197,3 @@ MarkBufferForCleanup(Buffer buffer, void (*CleanupFunc)(Buffer)) SpinRelease(BufMgrLock); return; } - -#endif /* ! XLOG */ diff --git a/src/backend/storage/buffer/xlog_bufmgr.c b/src/backend/storage/buffer/xlog_bufmgr.c deleted file mode 100644 index fb02413f970..00000000000 --- a/src/backend/storage/buffer/xlog_bufmgr.c +++ /dev/null @@ -1,2202 +0,0 @@ -/*------------------------------------------------------------------------- - * - * xlog_bufmgr.c - * buffer manager interface routines - * - * Portions Copyright (c) 1996-2000, PostgreSQL, Inc - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/xlog_bufmgr.c,v 1.6 2000/11/30 01:39:07 tgl Exp $ - * - *------------------------------------------------------------------------- - */ -/* - * - * BufferAlloc() -- lookup a buffer in the buffer table. If - * it isn't there add it, but do not read data into memory. - * This is used when we are about to reinitialize the - * buffer so don't care what the current disk contents are. - * BufferAlloc() also pins the new buffer in memory. - * - * ReadBuffer() -- like BufferAlloc() but reads the data - * on a buffer cache miss. - * - * ReleaseBuffer() -- unpin the buffer - * - * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty" - * but don't unpin. The disk IO is delayed until buffer - * replacement. - * - * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer() - * - * BufferSync() -- flush all dirty buffers in the buffer pool. - * - * InitBufferPool() -- Init the buffer module. - * - * See other files: - * freelist.c -- chooses victim for buffer replacement - * buf_table.c -- manages the buffer lookup table - */ -#include "postgres.h" - -#include <sys/types.h> -#include <sys/file.h> -#include <math.h> -#include <signal.h> - -#include "executor/execdebug.h" -#include "miscadmin.h" -#include "storage/buf_internals.h" -#include "storage/bufmgr.h" -#include "storage/s_lock.h" -#include "storage/smgr.h" -#include "utils/relcache.h" - -#ifdef XLOG -#include "catalog/pg_database.h" -#endif - -#define BufferGetLSN(bufHdr) \ - (*((XLogRecPtr*)MAKE_PTR((bufHdr)->data))) - - -extern SPINLOCK BufMgrLock; -extern long int ReadBufferCount; -extern long int ReadLocalBufferCount; -extern long int BufferHitCount; -extern long int LocalBufferHitCount; -extern long int BufferFlushCount; -extern long int LocalBufferFlushCount; - -/* - * It's used to avoid disk writes for read-only transactions - * (i.e. when no one shared buffer was changed by transaction). - * We set it to true in WriteBuffer/WriteNoReleaseBuffer when - * marking shared buffer as dirty. We set it to false in xact.c - * after transaction is committed/aborted. - */ -bool SharedBufferChanged = false; - -static void WaitIO(BufferDesc *buf, SPINLOCK spinlock); -static void StartBufferIO(BufferDesc *buf, bool forInput); -static void TerminateBufferIO(BufferDesc *buf); -static void ContinueBufferIO(BufferDesc *buf, bool forInput); -extern void AbortBufferIO(void); - -/* - * Macro : BUFFER_IS_BROKEN - * Note that write error doesn't mean the buffer broken -*/ -#define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY)) - -static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum, - bool bufferLockHeld); -static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, - bool *foundPtr, bool bufferLockHeld); -static int BufferReplace(BufferDesc *bufHdr); -void PrintBufferDescs(void); - -/* --------------------------------------------------- - * RelationGetBufferWithBuffer - * see if the given buffer is what we want - * if yes, we don't need to bother the buffer manager - * --------------------------------------------------- - */ -Buffer -RelationGetBufferWithBuffer(Relation relation, - BlockNumber blockNumber, - Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsValid(buffer)) - { - if (!BufferIsLocal(buffer)) - { - bufHdr = &BufferDescriptors[buffer - 1]; - SpinAcquire(BufMgrLock); - if (bufHdr->tag.blockNum == blockNumber && - RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node)) - { - SpinRelease(BufMgrLock); - return buffer; - } - return ReadBufferWithBufferLock(relation, blockNumber, true); - } - else - { - bufHdr = &LocalBufferDescriptors[-buffer - 1]; - if (bufHdr->tag.blockNum == blockNumber && - RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node)) - return buffer; - } - } - return ReadBuffer(relation, blockNumber); -} - -/* - * ReadBuffer -- returns a buffer containing the requested - * block of the requested relation. If the blknum - * requested is P_NEW, extend the relation file and - * allocate a new block. - * - * Returns: the buffer number for the buffer containing - * the block read or NULL on an error. - * - * Assume when this function is called, that reln has been - * opened already. - */ - -#undef ReadBuffer /* conflicts with macro when BUFMGR_DEBUG - * defined */ - -/* - * ReadBuffer - * - */ -Buffer -ReadBuffer(Relation reln, BlockNumber blockNum) -{ - return ReadBufferWithBufferLock(reln, blockNum, false); -} - -/* - * ReadBufferWithBufferLock -- does the work of - * ReadBuffer() but with the possibility that - * the buffer lock has already been held. this - * is yet another effort to reduce the number of - * semops in the system. - */ -static Buffer -ReadBufferWithBufferLock(Relation reln, - BlockNumber blockNum, - bool bufferLockHeld) -{ - BufferDesc *bufHdr; - int extend; /* extending the file by one block */ - int status; - bool found; - bool isLocalBuf; - - extend = (blockNum == P_NEW); - isLocalBuf = reln->rd_myxactonly; - - if (isLocalBuf) - { - ReadLocalBufferCount++; - bufHdr = LocalBufferAlloc(reln, blockNum, &found); - if (found) - LocalBufferHitCount++; - } - else - { - ReadBufferCount++; - - /* - * lookup the buffer. IO_IN_PROGRESS is set if the requested - * block is not currently in memory. - */ - bufHdr = BufferAlloc(reln, blockNum, &found, bufferLockHeld); - if (found) - BufferHitCount++; - } - - if (!bufHdr) - return InvalidBuffer; - - /* if it's already in the buffer pool, we're done */ - if (found) - { - - /* - * This happens when a bogus buffer was returned previously and is - * floating around in the buffer pool. A routine calling this - * would want this extended. - */ - if (extend) - { - /* new buffers are zero-filled */ - MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); - smgrextend(DEFAULT_SMGR, reln, - (char *) MAKE_PTR(bufHdr->data)); - } - return BufferDescriptorGetBuffer(bufHdr); - - } - - /* - * if we have gotten to this point, the reln pointer must be ok and - * the relation file must be open. - */ - if (extend) - { - /* new buffers are zero-filled */ - MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); - status = smgrextend(DEFAULT_SMGR, reln, - (char *) MAKE_PTR(bufHdr->data)); - } - else - { - status = smgrread(DEFAULT_SMGR, reln, blockNum, - (char *) MAKE_PTR(bufHdr->data)); - } - - if (isLocalBuf) - return BufferDescriptorGetBuffer(bufHdr); - - /* lock buffer manager again to update IO IN PROGRESS */ - SpinAcquire(BufMgrLock); - - if (status == SM_FAIL) - { - /* IO Failed. cleanup the data structures and go home */ - - if (!BufTableDelete(bufHdr)) - { - SpinRelease(BufMgrLock); - elog(FATAL, "BufRead: buffer table broken after IO error\n"); - } - /* remember that BufferAlloc() pinned the buffer */ - UnpinBuffer(bufHdr); - - /* - * Have to reset the flag so that anyone waiting for the buffer - * can tell that the contents are invalid. - */ - bufHdr->flags |= BM_IO_ERROR; - bufHdr->flags &= ~BM_IO_IN_PROGRESS; - } - else - { - /* IO Succeeded. clear the flags, finish buffer update */ - - bufHdr->flags &= ~(BM_IO_ERROR | BM_IO_IN_PROGRESS); - } - - /* If anyone was waiting for IO to complete, wake them up now */ - TerminateBufferIO(bufHdr); - - SpinRelease(BufMgrLock); - - if (status == SM_FAIL) - return InvalidBuffer; - - return BufferDescriptorGetBuffer(bufHdr); -} - -/* - * BufferAlloc -- Get a buffer from the buffer pool but dont - * read it. - * - * Returns: descriptor for buffer - * - * When this routine returns, the BufMgrLock is guaranteed NOT be held. - */ -static BufferDesc * -BufferAlloc(Relation reln, - BlockNumber blockNum, - bool *foundPtr, - bool bufferLockHeld) -{ - BufferDesc *buf, - *buf2; - BufferTag newTag; /* identity of requested block */ - bool inProgress; /* buffer undergoing IO */ - bool newblock = FALSE; - - /* create a new tag so we can lookup the buffer */ - /* assume that the relation is already open */ - if (blockNum == P_NEW) - { - newblock = TRUE; - blockNum = smgrnblocks(DEFAULT_SMGR, reln); - } - - INIT_BUFFERTAG(&newTag, reln, blockNum); - - if (!bufferLockHeld) - SpinAcquire(BufMgrLock); - - /* see if the block is in the buffer pool already */ - buf = BufTableLookup(&newTag); - if (buf != NULL) - { - - /* - * Found it. Now, (a) pin the buffer so no one steals it from the - * buffer pool, (b) check IO_IN_PROGRESS, someone may be faulting - * the buffer into the buffer pool. - */ - - PinBuffer(buf); - inProgress = (buf->flags & BM_IO_IN_PROGRESS); - - *foundPtr = TRUE; - if (inProgress) /* confirm end of IO */ - { - WaitIO(buf, BufMgrLock); - inProgress = (buf->flags & BM_IO_IN_PROGRESS); - } - if (BUFFER_IS_BROKEN(buf)) - { - - /* - * I couldn't understand the following old comment. If there's - * no IO for the buffer and the buffer is BROKEN,it should be - * read again. So start a new buffer IO here. - * - * wierd race condition: - * - * We were waiting for someone else to read the buffer. While we - * were waiting, the reader boof'd in some way, so the - * contents of the buffer are still invalid. By saying that - * we didn't find it, we can make the caller reinitialize the - * buffer. If two processes are waiting for this block, both - * will read the block. The second one to finish may - * overwrite any updates made by the first. (Assume higher - * level synchronization prevents this from happening). - * - * This is never going to happen, don't worry about it. - */ - *foundPtr = FALSE; - } -#ifdef BMTRACE - _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), RelationGetRelid(reln), blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCFND); -#endif /* BMTRACE */ - - if (!(*foundPtr)) - StartBufferIO(buf, true); - SpinRelease(BufMgrLock); - - return buf; - } - - *foundPtr = FALSE; - - /* - * Didn't find it in the buffer pool. We'll have to initialize a new - * buffer. First, grab one from the free list. If it's dirty, flush - * it to disk. Remember to unlock BufMgr spinlock while doing the IOs. - */ - inProgress = FALSE; - for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL;) - { - buf = GetFreeBuffer(); - - /* GetFreeBuffer will abort if it can't find a free buffer */ - Assert(buf); - - /* - * There should be exactly one pin on the buffer after it is - * allocated -- ours. If it had a pin it wouldn't have been on - * the free list. No one else could have pinned it between - * GetFreeBuffer and here because we have the BufMgrLock. - */ - Assert(buf->refcount == 0); - buf->refcount = 1; - PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1; - - if (buf->flags & BM_DIRTY || buf->cntxDirty) - { - bool smok; - - /* - * skip write error buffers - */ - if ((buf->flags & BM_IO_ERROR) != 0) - { - PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; - buf->refcount--; - buf = (BufferDesc *) NULL; - continue; - } - /* - * Set BM_IO_IN_PROGRESS to keep anyone from doing anything - * with the contents of the buffer while we write it out. We - * don't really care if they try to read it, but if they can - * complete a BufferAlloc on it they can then scribble into - * it, and we'd really like to avoid that while we are - * flushing the buffer. Setting this flag should block them - * in WaitIO until we're done. - */ - inProgress = TRUE; - - /* - * All code paths that acquire this lock pin the buffer first; - * since no one had it pinned (it just came off the free - * list), no one else can have this lock. - */ - StartBufferIO(buf, false); - - /* - * Write the buffer out, being careful to release BufMgrLock - * before starting the I/O. - */ - smok = BufferReplace(buf); - - if (smok == FALSE) - { - elog(NOTICE, "BufferAlloc: cannot write block %u for %s/%s", - buf->tag.blockNum, buf->blind.dbname, buf->blind.relname); - inProgress = FALSE; - buf->flags |= BM_IO_ERROR; - buf->flags &= ~BM_IO_IN_PROGRESS; - TerminateBufferIO(buf); - PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; - Assert(buf->refcount > 0); - buf->refcount--; - if (buf->refcount == 0) - { - AddBufferToFreelist(buf); - buf->flags |= BM_FREE; - } - buf = (BufferDesc *) NULL; - } - else - { - /* - * BM_JUST_DIRTIED cleared by BufferReplace and shouldn't - * be setted by anyone. - vadim 01/17/97 - */ - if (buf->flags & BM_JUST_DIRTIED) - { - elog(STOP, "BufferAlloc: content of block %u (%s) changed while flushing", - buf->tag.blockNum, buf->blind.relname); - } - else - buf->flags &= ~BM_DIRTY; - buf->cntxDirty = false; - } - - /* - * Somebody could have pinned the buffer while we were doing - * the I/O and had given up the BufMgrLock (though they would - * be waiting for us to clear the BM_IO_IN_PROGRESS flag). - * That's why this is a loop -- if so, we need to clear the - * I/O flags, remove our pin and start all over again. - * - * People may be making buffers free at any time, so there's no - * reason to think that we have an immediate disaster on our - * hands. - */ - if (buf && buf->refcount > 1) - { - inProgress = FALSE; - buf->flags &= ~BM_IO_IN_PROGRESS; - TerminateBufferIO(buf); - PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; - buf->refcount--; - buf = (BufferDesc *) NULL; - } - - /* - * Somebody could have allocated another buffer for the same - * block we are about to read in. (While we flush out the - * dirty buffer, we don't hold the lock and someone could have - * allocated another buffer for the same block. The problem is - * we haven't gotten around to insert the new tag into the - * buffer table. So we need to check here. -ay 3/95 - */ - buf2 = BufTableLookup(&newTag); - if (buf2 != NULL) - { - - /* - * Found it. Someone has already done what we're about to - * do. We'll just handle this as if it were found in the - * buffer pool in the first place. - */ - if (buf != NULL) - { - buf->flags &= ~BM_IO_IN_PROGRESS; - TerminateBufferIO(buf); - /* give up the buffer since we don't need it any more */ - PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; - Assert(buf->refcount > 0); - buf->refcount--; - if (buf->refcount == 0) - { - AddBufferToFreelist(buf); - buf->flags |= BM_FREE; - } - } - - PinBuffer(buf2); - inProgress = (buf2->flags & BM_IO_IN_PROGRESS); - - *foundPtr = TRUE; - if (inProgress) - { - WaitIO(buf2, BufMgrLock); - inProgress = (buf2->flags & BM_IO_IN_PROGRESS); - } - if (BUFFER_IS_BROKEN(buf2)) - *foundPtr = FALSE; - - if (!(*foundPtr)) - StartBufferIO(buf2, true); - SpinRelease(BufMgrLock); - - return buf2; - } - } - } - - /* - * At this point we should have the sole pin on a non-dirty buffer and - * we may or may not already have the BM_IO_IN_PROGRESS flag set. - */ - - /* - * Change the name of the buffer in the lookup table: - * - * Need to update the lookup table before the read starts. If someone - * comes along looking for the buffer while we are reading it in, we - * don't want them to allocate a new buffer. For the same reason, we - * didn't want to erase the buf table entry for the buffer we were - * writing back until now, either. - */ - - if (!BufTableDelete(buf)) - { - SpinRelease(BufMgrLock); - elog(FATAL, "buffer wasn't in the buffer table\n"); - } - - /* record the database name and relation name for this buffer */ - strcpy(buf->blind.dbname, (DatabaseName) ? DatabaseName : "Recovery"); - strcpy(buf->blind.relname, RelationGetPhysicalRelationName(reln)); - - INIT_BUFFERTAG(&(buf->tag), reln, blockNum); - if (!BufTableInsert(buf)) - { - SpinRelease(BufMgrLock); - elog(FATAL, "Buffer in lookup table twice \n"); - } - - /* - * Buffer contents are currently invalid. Have to mark IO IN PROGRESS - * so no one fiddles with them until the read completes. If this - * routine has been called simply to allocate a buffer, no io will be - * attempted, so the flag isnt set. - */ - if (!inProgress) - StartBufferIO(buf, true); - else - ContinueBufferIO(buf, true); - -#ifdef BMTRACE - _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), RelationGetRelid(reln), blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCNOTFND); -#endif /* BMTRACE */ - - SpinRelease(BufMgrLock); - - return buf; -} - -/* - * WriteBuffer - * - * Marks buffer contents as dirty (actual write happens later). - * - * Assume that buffer is pinned. Assume that reln is - * valid. - * - * Side Effects: - * Pin count is decremented. - */ - -#undef WriteBuffer - -int -WriteBuffer(Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsLocal(buffer)) - return WriteLocalBuffer(buffer, TRUE); - - if (BAD_BUFFER_ID(buffer)) - return FALSE; - - bufHdr = &BufferDescriptors[buffer - 1]; - - SharedBufferChanged = true; - - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - - UnpinBuffer(bufHdr); - SpinRelease(BufMgrLock); - - return TRUE; -} - -/* - * WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer - * when the operation is complete. - */ -int -WriteNoReleaseBuffer(Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsLocal(buffer)) - return WriteLocalBuffer(buffer, FALSE); - - if (BAD_BUFFER_ID(buffer)) - return STATUS_ERROR; - - bufHdr = &BufferDescriptors[buffer - 1]; - - SharedBufferChanged = true; - - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - - SpinRelease(BufMgrLock); - - return STATUS_OK; -} - - -#undef ReleaseAndReadBuffer -/* - * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer() - * so that only one semop needs to be called. - * - */ -Buffer -ReleaseAndReadBuffer(Buffer buffer, - Relation relation, - BlockNumber blockNum) -{ - BufferDesc *bufHdr; - Buffer retbuf; - - if (BufferIsLocal(buffer)) - { - Assert(LocalRefCount[-buffer - 1] > 0); - LocalRefCount[-buffer - 1]--; - } - else - { - if (BufferIsValid(buffer)) - { - bufHdr = &BufferDescriptors[buffer - 1]; - Assert(PrivateRefCount[buffer - 1] > 0); - PrivateRefCount[buffer - 1]--; - if (PrivateRefCount[buffer - 1] == 0) - { - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - bufHdr->refcount--; - if (bufHdr->refcount == 0) - { - AddBufferToFreelist(bufHdr); - bufHdr->flags |= BM_FREE; - } - retbuf = ReadBufferWithBufferLock(relation, blockNum, true); - return retbuf; - } - } - } - - return ReadBuffer(relation, blockNum); -} - -/* - * BufferSync -- Write all dirty buffers in the pool. - * - * This is called at checkpoint time and write out all dirty buffers. - */ -void -BufferSync() -{ - int i; - BufferDesc *bufHdr; - Buffer buffer; - int status; - RelFileNode rnode; - XLogRecPtr recptr; - Relation reln = NULL; - - for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) - { - - SpinAcquire(BufMgrLock); - - if (!(bufHdr->flags & BM_VALID)) - { - SpinRelease(BufMgrLock); - continue; - } - - /* - * Pin buffer and ensure that no one reads it from disk - */ - PinBuffer(bufHdr); - /* Synchronize with BufferAlloc */ - if (bufHdr->flags & BM_IO_IN_PROGRESS) - WaitIO(bufHdr, BufMgrLock); - - buffer = BufferDescriptorGetBuffer(bufHdr); - rnode = bufHdr->tag.rnode; - - SpinRelease(BufMgrLock); - - /* - * Try to find relation for buffer - */ - reln = RelationNodeCacheGetRelation(rnode); - - /* - * Protect buffer content against concurrent update - */ - LockBuffer(buffer, BUFFER_LOCK_SHARE); - - /* - * Force XLOG flush for buffer' LSN - */ - recptr = BufferGetLSN(bufHdr); - XLogFlush(recptr); - - /* - * Now it's safe to write buffer to disk - * (if needed at all -:)) - */ - - SpinAcquire(BufMgrLock); - if (bufHdr->flags & BM_IO_IN_PROGRESS) - WaitIO(bufHdr, BufMgrLock); - - if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) - { - bufHdr->flags &= ~BM_JUST_DIRTIED; - StartBufferIO(bufHdr, false); /* output IO start */ - - SpinRelease(BufMgrLock); - - if (reln == (Relation) NULL) - { - status = smgrblindwrt(DEFAULT_SMGR, - bufHdr->tag.rnode, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data), - true); /* must fsync */ - } - else - { - status = smgrwrite(DEFAULT_SMGR, reln, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - } - - if (status == SM_FAIL) /* disk failure ?! */ - elog(STOP, "BufferSync: cannot write %u for %s", - bufHdr->tag.blockNum, bufHdr->blind.relname); - - /* - * Note that it's safe to change cntxDirty here because of - * we protect it from upper writers by share lock and from - * other bufmgr routines by BM_IO_IN_PROGRESS - */ - bufHdr->cntxDirty = false; - - /* - * Release the per-buffer readlock, reacquire BufMgrLock. - */ - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - BufferFlushCount++; - - SpinAcquire(BufMgrLock); - - bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ - TerminateBufferIO(bufHdr); /* Sync IO finished */ - - /* - * If this buffer was marked by someone as DIRTY while - * we were flushing it out we must not clear DIRTY - * flag - vadim 01/17/97 - */ - if (!(bufHdr->flags & BM_JUST_DIRTIED)) - bufHdr->flags &= ~BM_DIRTY; - } - else - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - UnpinBuffer(bufHdr); - - SpinRelease(BufMgrLock); - - /* drop refcnt obtained by RelationNodeCacheGetRelation */ - if (reln != (Relation) NULL) - { - RelationDecrementReferenceCount(reln); - reln = NULL; - } - } - -} - -/* - * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared. - * - * Should be entered with buffer manager spinlock held; releases it before - * waiting and re-acquires it afterwards. - */ -static void -WaitIO(BufferDesc *buf, SPINLOCK spinlock) -{ - - /* - * Changed to wait until there's no IO - Inoue 01/13/2000 - */ - while ((buf->flags & BM_IO_IN_PROGRESS) != 0) - { - SpinRelease(spinlock); - S_LOCK(&(buf->io_in_progress_lock)); - S_UNLOCK(&(buf->io_in_progress_lock)); - SpinAcquire(spinlock); - } -} - - -long NDirectFileRead; /* some I/O's are direct file access. - * bypass bufmgr */ -long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ - -void -PrintBufferUsage(FILE *statfp) -{ - float hitrate; - float localhitrate; - - if (ReadBufferCount == 0) - hitrate = 0.0; - else - hitrate = (float) BufferHitCount *100.0 / ReadBufferCount; - - if (ReadLocalBufferCount == 0) - localhitrate = 0.0; - else - localhitrate = (float) LocalBufferHitCount *100.0 / ReadLocalBufferCount; - - fprintf(statfp, "!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n", - ReadBufferCount - BufferHitCount, BufferFlushCount, hitrate); - fprintf(statfp, "!\tLocal blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n", - ReadLocalBufferCount - LocalBufferHitCount, LocalBufferFlushCount, localhitrate); - fprintf(statfp, "!\tDirect blocks: %10ld read, %10ld written\n", - NDirectFileRead, NDirectFileWrite); -} - -void -ResetBufferUsage() -{ - BufferHitCount = 0; - ReadBufferCount = 0; - BufferFlushCount = 0; - LocalBufferHitCount = 0; - ReadLocalBufferCount = 0; - LocalBufferFlushCount = 0; - NDirectFileRead = 0; - NDirectFileWrite = 0; -} - -/* ---------------------------------------------- - * ResetBufferPool - * - * This routine is supposed to be called when a transaction aborts. - * it will release all the buffer pins held by the transaction. - * Currently, we also call it during commit if BufferPoolCheckLeak - * detected a problem --- in that case, isCommit is TRUE, and we - * only clean up buffer pin counts. - * - * During abort, we also forget any pending fsync requests. Dirtied buffers - * will still get written, eventually, but there will be no fsync for them. - * - * ---------------------------------------------- - */ -void -ResetBufferPool(bool isCommit) -{ - int i; - - for (i = 0; i < NBuffers; i++) - { - if (PrivateRefCount[i] != 0) - { - BufferDesc *buf = &BufferDescriptors[i]; - - SpinAcquire(BufMgrLock); - Assert(buf->refcount > 0); - buf->refcount--; - if (buf->refcount == 0) - { - AddBufferToFreelist(buf); - buf->flags |= BM_FREE; - } - SpinRelease(BufMgrLock); - } - PrivateRefCount[i] = 0; - } - - ResetLocalBufferPool(); - - if (!isCommit) - smgrabort(); -} - -/* ----------------------------------------------- - * BufferPoolCheckLeak - * - * check if there is buffer leak - * - * ----------------------------------------------- - */ -int -BufferPoolCheckLeak() -{ - int i; - int result = 0; - - for (i = 1; i <= NBuffers; i++) - { - if (PrivateRefCount[i - 1] != 0) - { - BufferDesc *buf = &(BufferDescriptors[i - 1]); - - elog(NOTICE, - "Buffer Leak: [%03d] (freeNext=%ld, freePrev=%ld, \ -relname=%s, blockNum=%d, flags=0x%x, refcount=%d %ld)", - i - 1, buf->freeNext, buf->freePrev, - buf->blind.relname, buf->tag.blockNum, buf->flags, - buf->refcount, PrivateRefCount[i - 1]); - result = 1; - } - } - return result; -} - -/* ------------------------------------------------ - * FlushBufferPool - * - * Flush all dirty blocks in buffer pool to disk - * at the checkpoint time - * ------------------------------------------------ - */ -void -FlushBufferPool(void) -{ - BufferSync(); - smgrsync(); -} - -/* - * At the commit time we have to flush local buffer pool only - */ -void -BufmgrCommit(void) -{ - LocalBufferSync(); - /* - * All files created in current transaction will be fsync-ed - */ - smgrcommit(); -} - -/* - * BufferGetBlockNumber - * Returns the block number associated with a buffer. - * - * Note: - * Assumes that the buffer is valid. - */ -BlockNumber -BufferGetBlockNumber(Buffer buffer) -{ - Assert(BufferIsValid(buffer)); - - /* XXX should be a critical section */ - if (BufferIsLocal(buffer)) - return LocalBufferDescriptors[-buffer - 1].tag.blockNum; - else - return BufferDescriptors[buffer - 1].tag.blockNum; -} - -/* - * BufferReplace - * - * Write out the buffer corresponding to 'bufHdr' - * - * BufMgrLock must be held at entry, and the buffer must be pinned. - */ -static int -BufferReplace(BufferDesc *bufHdr) -{ - Relation reln; - XLogRecPtr recptr; - int status; - - /* To check if block content changed while flushing. - vadim 01/17/97 */ - bufHdr->flags &= ~BM_JUST_DIRTIED; - - SpinRelease(BufMgrLock); - - /* - * No need to lock buffer context - no one should be able to - * end ReadBuffer - */ - recptr = BufferGetLSN(bufHdr); - XLogFlush(recptr); - - reln = RelationNodeCacheGetRelation(bufHdr->tag.rnode); - - if (reln != (Relation) NULL) - { - status = smgrwrite(DEFAULT_SMGR, reln, bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - } - else - { - status = smgrblindwrt(DEFAULT_SMGR, bufHdr->tag.rnode, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data), - false); /* no fsync */ - } - - /* drop relcache refcnt incremented by RelationNodeCacheGetRelation */ - if (reln != (Relation) NULL) - RelationDecrementReferenceCount(reln); - - SpinAcquire(BufMgrLock); - - if (status == SM_FAIL) - return FALSE; - - BufferFlushCount++; - - return TRUE; -} - -/* - * RelationGetNumberOfBlocks - * Returns the buffer descriptor associated with a page in a relation. - * - * Note: - * XXX may fail for huge relations. - * XXX should be elsewhere. - * XXX maybe should be hidden - */ -BlockNumber -RelationGetNumberOfBlocks(Relation relation) -{ - return ((relation->rd_myxactonly) ? relation->rd_nblocks : - ((relation->rd_rel->relkind == RELKIND_VIEW) ? 0 : - smgrnblocks(DEFAULT_SMGR, relation))); -} - -/* --------------------------------------------------------------------- - * DropRelationBuffers - * - * This function removes all the buffered pages for a relation - * from the buffer pool. Dirty pages are simply dropped, without - * bothering to write them out first. This is NOT rollback-able, - * and so should be used only with extreme caution! - * - * We assume that the caller holds an exclusive lock on the relation, - * which should assure that no new buffers will be acquired for the rel - * meanwhile. - * - * XXX currently it sequentially searches the buffer pool, should be - * changed to more clever ways of searching. - * -------------------------------------------------------------------- - */ -void -DropRelationBuffers(Relation rel) -{ - int i; - BufferDesc *bufHdr; - - if (rel->rd_myxactonly) - { - for (i = 0; i < NLocBuffer; i++) - { - bufHdr = &LocalBufferDescriptors[i]; - if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) - { - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->cntxDirty = false; - LocalRefCount[i] = 0; - bufHdr->tag.rnode.relNode = InvalidOid; - } - } - return; - } - - SpinAcquire(BufMgrLock); - for (i = 1; i <= NBuffers; i++) - { - bufHdr = &BufferDescriptors[i - 1]; -recheck: - if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) - { - - /* - * If there is I/O in progress, better wait till it's done; - * don't want to delete the relation out from under someone - * who's just trying to flush the buffer! - */ - if (bufHdr->flags & BM_IO_IN_PROGRESS) - { - WaitIO(bufHdr, BufMgrLock); - - /* - * By now, the buffer very possibly belongs to some other - * rel, so check again before proceeding. - */ - goto recheck; - } - /* Now we can do what we came for */ - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->cntxDirty = false; - - /* - * Release any refcount we may have. - * - * This is very probably dead code, and if it isn't then it's - * probably wrong. I added the Assert to find out --- tgl - * 11/99. - */ - if (!(bufHdr->flags & BM_FREE)) - { - /* Assert checks that buffer will actually get freed! */ - Assert(PrivateRefCount[i - 1] == 1 && - bufHdr->refcount == 1); - /* ReleaseBuffer expects we do not hold the lock at entry */ - SpinRelease(BufMgrLock); - ReleaseBuffer(i); - SpinAcquire(BufMgrLock); - } - /* - * And mark the buffer as no longer occupied by this rel. - */ - BufTableDelete(bufHdr); - } - } - - SpinRelease(BufMgrLock); -} - -/* --------------------------------------------------------------------- - * DropRelFileNodeBuffers - * - * This is the same as DropRelationBuffers, except that the target - * relation is specified by RelFileNode. - * - * This is NOT rollback-able. One legitimate use is to clear the - * buffer cache of buffers for a relation that is being deleted - * during transaction abort. - * -------------------------------------------------------------------- - */ -void -DropRelFileNodeBuffers(RelFileNode rnode) -{ - int i; - BufferDesc *bufHdr; - - /* We have to search both local and shared buffers... */ - - for (i = 0; i < NLocBuffer; i++) - { - bufHdr = &LocalBufferDescriptors[i]; - if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) - { - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->cntxDirty = false; - LocalRefCount[i] = 0; - bufHdr->tag.rnode.relNode = InvalidOid; - } - } - - SpinAcquire(BufMgrLock); - for (i = 1; i <= NBuffers; i++) - { - bufHdr = &BufferDescriptors[i - 1]; -recheck: - if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) - { - - /* - * If there is I/O in progress, better wait till it's done; - * don't want to delete the relation out from under someone - * who's just trying to flush the buffer! - */ - if (bufHdr->flags & BM_IO_IN_PROGRESS) - { - WaitIO(bufHdr, BufMgrLock); - - /* - * By now, the buffer very possibly belongs to some other - * rel, so check again before proceeding. - */ - goto recheck; - } - /* Now we can do what we came for */ - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->cntxDirty = false; - - /* - * Release any refcount we may have. - * - * This is very probably dead code, and if it isn't then it's - * probably wrong. I added the Assert to find out --- tgl - * 11/99. - */ - if (!(bufHdr->flags & BM_FREE)) - { - /* Assert checks that buffer will actually get freed! */ - Assert(PrivateRefCount[i - 1] == 1 && - bufHdr->refcount == 1); - /* ReleaseBuffer expects we do not hold the lock at entry */ - SpinRelease(BufMgrLock); - ReleaseBuffer(i); - SpinAcquire(BufMgrLock); - } - /* - * And mark the buffer as no longer occupied by this rel. - */ - BufTableDelete(bufHdr); - } - } - - SpinRelease(BufMgrLock); -} - -/* --------------------------------------------------------------------- - * DropBuffers - * - * This function removes all the buffers in the buffer cache for a - * particular database. Dirty pages are simply dropped, without - * bothering to write them out first. This is used when we destroy a - * database, to avoid trying to flush data to disk when the directory - * tree no longer exists. Implementation is pretty similar to - * DropRelationBuffers() which is for destroying just one relation. - * -------------------------------------------------------------------- - */ -void -DropBuffers(Oid dbid) -{ - int i; - BufferDesc *bufHdr; - - SpinAcquire(BufMgrLock); - for (i = 1; i <= NBuffers; i++) - { - bufHdr = &BufferDescriptors[i - 1]; -recheck: - /* - * We know that currently database OID is tblNode but - * this probably will be changed in future and this - * func will be used to drop tablespace buffers. - */ - if (bufHdr->tag.rnode.tblNode == dbid) - { - - /* - * If there is I/O in progress, better wait till it's done; - * don't want to delete the database out from under someone - * who's just trying to flush the buffer! - */ - if (bufHdr->flags & BM_IO_IN_PROGRESS) - { - WaitIO(bufHdr, BufMgrLock); - - /* - * By now, the buffer very possibly belongs to some other - * DB, so check again before proceeding. - */ - goto recheck; - } - /* Now we can do what we came for */ - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->cntxDirty = false; - - /* - * The thing should be free, if caller has checked that no - * backends are running in that database. - */ - Assert(bufHdr->flags & BM_FREE); - /* - * And mark the buffer as no longer occupied by this page. - */ - BufTableDelete(bufHdr); - } - } - SpinRelease(BufMgrLock); -} - -/* ----------------------------------------------------------------- - * PrintBufferDescs - * - * this function prints all the buffer descriptors, for debugging - * use only. - * ----------------------------------------------------------------- - */ -void -PrintBufferDescs() -{ - int i; - BufferDesc *buf = BufferDescriptors; - - if (IsUnderPostmaster) - { - SpinAcquire(BufMgrLock); - for (i = 0; i < NBuffers; ++i, ++buf) - { - elog(DEBUG, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \ -blockNum=%d, flags=0x%x, refcount=%d %ld)", - i, buf->freeNext, buf->freePrev, - buf->blind.relname, buf->tag.blockNum, buf->flags, - buf->refcount, PrivateRefCount[i]); - } - SpinRelease(BufMgrLock); - } - else - { - /* interactive backend */ - for (i = 0; i < NBuffers; ++i, ++buf) - { - printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n", - i, buf->blind.relname, buf->tag.blockNum, - buf->flags, buf->refcount, PrivateRefCount[i]); - } - } -} - -void -PrintPinnedBufs() -{ - int i; - BufferDesc *buf = BufferDescriptors; - - SpinAcquire(BufMgrLock); - for (i = 0; i < NBuffers; ++i, ++buf) - { - if (PrivateRefCount[i] > 0) - elog(NOTICE, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \ -blockNum=%d, flags=0x%x, refcount=%d %ld)\n", - i, buf->freeNext, buf->freePrev, buf->blind.relname, - buf->tag.blockNum, buf->flags, - buf->refcount, PrivateRefCount[i]); - } - SpinRelease(BufMgrLock); -} - -/* - * BufferPoolBlowaway - * - * this routine is solely for the purpose of experiments -- sometimes - * you may want to blowaway whatever is left from the past in buffer - * pool and start measuring some performance with a clean empty buffer - * pool. - */ -#ifdef NOT_USED -void -BufferPoolBlowaway() -{ - int i; - - BufferSync(); - for (i = 1; i <= NBuffers; i++) - { - if (BufferIsValid(i)) - { - while (BufferIsValid(i)) - ReleaseBuffer(i); - } - BufTableDelete(&BufferDescriptors[i - 1]); - } -} - -#endif - -/* --------------------------------------------------------------------- - * FlushRelationBuffers - * - * This function writes all dirty pages of a relation out to disk. - * Furthermore, pages that have blocknumber >= firstDelBlock are - * actually removed from the buffer pool. An error code is returned - * if we fail to dump a dirty buffer or if we find one of - * the target pages is pinned into the cache. - * - * This is called by DROP TABLE to clear buffers for the relation - * from the buffer pool. Note that we must write dirty buffers, - * rather than just dropping the changes, because our transaction - * might abort later on; we want to roll back safely in that case. - * - * This is also called by VACUUM before truncating the relation to the - * given number of blocks. It might seem unnecessary for VACUUM to - * write dirty pages before firstDelBlock, since VACUUM should already - * have committed its changes. However, it is possible for there still - * to be dirty pages: if some page had unwritten on-row tuple status - * updates from a prior transaction, and VACUUM had no additional - * changes to make to that page, then VACUUM won't have written it. - * This is harmless in most cases but will break pg_upgrade, which - * relies on VACUUM to ensure that *all* tuples have correct on-row - * status. So, we check and flush all dirty pages of the rel - * regardless of block number. - * - * In all cases, the caller should be holding AccessExclusiveLock on - * the target relation to ensure that no other backend is busy reading - * more blocks of the relation (or might do so before we commit). - * - * Formerly, we considered it an error condition if we found dirty - * buffers here. However, since BufferSync no longer forces out all - * dirty buffers at every xact commit, it's possible for dirty buffers - * to still be present in the cache due to failure of an earlier - * transaction. So, must flush dirty buffers without complaint. - * - * Returns: 0 - Ok, -1 - FAILED TO WRITE DIRTY BUFFER, -2 - PINNED - * - * XXX currently it sequentially searches the buffer pool, should be - * changed to more clever ways of searching. - * -------------------------------------------------------------------- - */ -int -FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) -{ - int i; - BufferDesc *bufHdr; - XLogRecPtr recptr; - int status; - - if (rel->rd_myxactonly) - { - for (i = 0; i < NLocBuffer; i++) - { - bufHdr = &LocalBufferDescriptors[i]; - if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) - { - if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) - { - status = smgrwrite(DEFAULT_SMGR, rel, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - if (status == SM_FAIL) - { - elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it", - RelationGetRelationName(rel), firstDelBlock, - bufHdr->tag.blockNum); - return(-1); - } - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->cntxDirty = false; - } - if (LocalRefCount[i] > 0) - { - elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is referenced (%ld)", - RelationGetRelationName(rel), firstDelBlock, - bufHdr->tag.blockNum, LocalRefCount[i]); - return(-2); - } - if (bufHdr->tag.blockNum >= firstDelBlock) - { - bufHdr->tag.rnode.relNode = InvalidOid; - } - } - } - return 0; - } - - SpinAcquire(BufMgrLock); - for (i = 0; i < NBuffers; i++) - { - bufHdr = &BufferDescriptors[i]; - if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) - { - if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) - { - PinBuffer(bufHdr); - if (bufHdr->flags & BM_IO_IN_PROGRESS) - WaitIO(bufHdr, BufMgrLock); - SpinRelease(BufMgrLock); - - /* - * Force XLOG flush for buffer' LSN - */ - recptr = BufferGetLSN(bufHdr); - XLogFlush(recptr); - - /* - * Now it's safe to write buffer to disk - */ - - SpinAcquire(BufMgrLock); - if (bufHdr->flags & BM_IO_IN_PROGRESS) - WaitIO(bufHdr, BufMgrLock); - - if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) - { - bufHdr->flags &= ~BM_JUST_DIRTIED; - StartBufferIO(bufHdr, false); /* output IO start */ - - SpinRelease(BufMgrLock); - - status = smgrwrite(DEFAULT_SMGR, rel, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - - if (status == SM_FAIL) /* disk failure ?! */ - elog(STOP, "FlushRelationBuffers: cannot write %u for %s", - bufHdr->tag.blockNum, bufHdr->blind.relname); - - BufferFlushCount++; - - SpinAcquire(BufMgrLock); - bufHdr->flags &= ~BM_IO_IN_PROGRESS; - TerminateBufferIO(bufHdr); - Assert(!(bufHdr->flags & BM_JUST_DIRTIED)); - bufHdr->flags &= ~BM_DIRTY; - /* - * Note that it's safe to change cntxDirty here because - * of we protect it from upper writers by - * AccessExclusiveLock and from other bufmgr routines - * by BM_IO_IN_PROGRESS - */ - bufHdr->cntxDirty = false; - } - UnpinBuffer(bufHdr); - } - if (!(bufHdr->flags & BM_FREE)) - { - SpinRelease(BufMgrLock); - elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is referenced (private %ld, global %d)", - RelationGetRelationName(rel), firstDelBlock, - bufHdr->tag.blockNum, - PrivateRefCount[i], bufHdr->refcount); - return -2; - } - if (bufHdr->tag.blockNum >= firstDelBlock) - { - BufTableDelete(bufHdr); - } - } - } - SpinRelease(BufMgrLock); - return 0; -} - -#undef ReleaseBuffer - -/* - * ReleaseBuffer -- remove the pin on a buffer without - * marking it dirty. - * - */ -int -ReleaseBuffer(Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsLocal(buffer)) - { - Assert(LocalRefCount[-buffer - 1] > 0); - LocalRefCount[-buffer - 1]--; - return STATUS_OK; - } - - if (BAD_BUFFER_ID(buffer)) - return STATUS_ERROR; - - bufHdr = &BufferDescriptors[buffer - 1]; - - Assert(PrivateRefCount[buffer - 1] > 0); - PrivateRefCount[buffer - 1]--; - if (PrivateRefCount[buffer - 1] == 0) - { - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - bufHdr->refcount--; - if (bufHdr->refcount == 0) - { - AddBufferToFreelist(bufHdr); - bufHdr->flags |= BM_FREE; - } - SpinRelease(BufMgrLock); - } - - return STATUS_OK; -} - -#ifdef NOT_USED -void -IncrBufferRefCount_Debug(char *file, int line, Buffer buffer) -{ - IncrBufferRefCount(buffer); - if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) - { - BufferDesc *buf = &BufferDescriptors[buffer - 1]; - - fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \ -refcount = %ld, file: %s, line: %d\n", - buffer, buf->blind.relname, buf->tag.blockNum, - PrivateRefCount[buffer - 1], file, line); - } -} - -#endif - -#ifdef NOT_USED -void -ReleaseBuffer_Debug(char *file, int line, Buffer buffer) -{ - ReleaseBuffer(buffer); - if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) - { - BufferDesc *buf = &BufferDescriptors[buffer - 1]; - - fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \ -refcount = %ld, file: %s, line: %d\n", - buffer, buf->blind.relname, buf->tag.blockNum, - PrivateRefCount[buffer - 1], file, line); - } -} - -#endif - -#ifdef NOT_USED -int -ReleaseAndReadBuffer_Debug(char *file, - int line, - Buffer buffer, - Relation relation, - BlockNumber blockNum) -{ - bool bufferValid; - Buffer b; - - bufferValid = BufferIsValid(buffer); - b = ReleaseAndReadBuffer(buffer, relation, blockNum); - if (ShowPinTrace && bufferValid && BufferIsLocal(buffer) - && is_userbuffer(buffer)) - { - BufferDesc *buf = &BufferDescriptors[buffer - 1]; - - fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ -refcount = %ld, file: %s, line: %d\n", - buffer, buf->blind.relname, buf->tag.blockNum, - PrivateRefCount[buffer - 1], file, line); - } - if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) - { - BufferDesc *buf = &BufferDescriptors[b - 1]; - - fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ -refcount = %ld, file: %s, line: %d\n", - b, buf->blind.relname, buf->tag.blockNum, - PrivateRefCount[b - 1], file, line); - } - return b; -} - -#endif - -#ifdef BMTRACE - -/* - * trace allocations and deallocations in a circular buffer in - * shared memory. check the buffer before doing the allocation, - * and die if there's anything fishy. - */ - -_bm_trace(Oid dbId, Oid relId, int blkNo, int bufNo, int allocType) -{ - long start, - cur; - bmtrace *tb; - - start = *CurTraceBuf; - - if (start > 0) - cur = start - 1; - else - cur = BMT_LIMIT - 1; - - for (;;) - { - tb = &TraceBuf[cur]; - if (tb->bmt_op != BMT_NOTUSED) - { - if (tb->bmt_buf == bufNo) - { - if ((tb->bmt_op == BMT_DEALLOC) - || (tb->bmt_dbid == dbId && tb->bmt_relid == relId - && tb->bmt_blkno == blkNo)) - goto okay; - - /* die holding the buffer lock */ - _bm_die(dbId, relId, blkNo, bufNo, allocType, start, cur); - } - } - - if (cur == start) - goto okay; - - if (cur == 0) - cur = BMT_LIMIT - 1; - else - cur--; - } - -okay: - tb = &TraceBuf[start]; - tb->bmt_pid = MyProcPid; - tb->bmt_buf = bufNo; - tb->bmt_dbid = dbId; - tb->bmt_relid = relId; - tb->bmt_blkno = blkNo; - tb->bmt_op = allocType; - - *CurTraceBuf = (start + 1) % BMT_LIMIT; -} - -_bm_die(Oid dbId, Oid relId, int blkNo, int bufNo, - int allocType, long start, long cur) -{ - FILE *fp; - bmtrace *tb; - int i; - - tb = &TraceBuf[cur]; - - if ((fp = AllocateFile("/tmp/death_notice", "w")) == NULL) - elog(FATAL, "buffer alloc trace error and can't open log file"); - - fprintf(fp, "buffer alloc trace detected the following error:\n\n"); - fprintf(fp, " buffer %d being %s inconsistently with a previous %s\n\n", - bufNo, (allocType == BMT_DEALLOC ? "deallocated" : "allocated"), - (tb->bmt_op == BMT_DEALLOC ? "deallocation" : "allocation")); - - fprintf(fp, "the trace buffer contains:\n"); - - i = start; - for (;;) - { - tb = &TraceBuf[i]; - if (tb->bmt_op != BMT_NOTUSED) - { - fprintf(fp, " [%3d]%spid %d buf %2d for <%d,%u,%d> ", - i, (i == cur ? " ---> " : "\t"), - tb->bmt_pid, tb->bmt_buf, - tb->bmt_dbid, tb->bmt_relid, tb->bmt_blkno); - - switch (tb->bmt_op) - { - case BMT_ALLOCFND: - fprintf(fp, "allocate (found)\n"); - break; - - case BMT_ALLOCNOTFND: - fprintf(fp, "allocate (not found)\n"); - break; - - case BMT_DEALLOC: - fprintf(fp, "deallocate\n"); - break; - - default: - fprintf(fp, "unknown op type %d\n", tb->bmt_op); - break; - } - } - - i = (i + 1) % BMT_LIMIT; - if (i == start) - break; - } - - fprintf(fp, "\noperation causing error:\n"); - fprintf(fp, "\tpid %d buf %d for <%d,%u,%d> ", - getpid(), bufNo, dbId, relId, blkNo); - - switch (allocType) - { - case BMT_ALLOCFND: - fprintf(fp, "allocate (found)\n"); - break; - - case BMT_ALLOCNOTFND: - fprintf(fp, "allocate (not found)\n"); - break; - - case BMT_DEALLOC: - fprintf(fp, "deallocate\n"); - break; - - default: - fprintf(fp, "unknown op type %d\n", allocType); - break; - } - - FreeFile(fp); - - kill(getpid(), SIGILL); -} - -#endif /* BMTRACE */ - -/* - * SetBufferCommitInfoNeedsSave - * - * Mark a buffer dirty when we have updated tuple commit-status bits in it. - * - * This is similar to WriteNoReleaseBuffer, except that we do not set - * SharedBufferChanged or BufferDirtiedByMe, because we have not made a - * critical change that has to be flushed to disk before xact commit --- the - * status-bit update could be redone by someone else just as easily. The - * buffer will be marked dirty, but it will not be written to disk until - * there is another reason to write it. - * - * This routine might get called many times on the same page, if we are making - * the first scan after commit of an xact that added/deleted many tuples. - * So, be as quick as we can if the buffer is already dirty. - */ -void -SetBufferCommitInfoNeedsSave(Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsLocal(buffer)) - return; - - if (BAD_BUFFER_ID(buffer)) - return; - - bufHdr = &BufferDescriptors[buffer - 1]; - - if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) != - (BM_DIRTY | BM_JUST_DIRTIED)) - { - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - SpinRelease(BufMgrLock); - } -} - -void -UnlockBuffers() -{ - BufferDesc *buf; - int i; - - for (i = 0; i < NBuffers; i++) - { - if (BufferLocks[i] == 0) - continue; - - Assert(BufferIsValid(i + 1)); - buf = &(BufferDescriptors[i]); - - S_LOCK(&(buf->cntx_lock)); - - if (BufferLocks[i] & BL_R_LOCK) - { - Assert(buf->r_locks > 0); - (buf->r_locks)--; - } - if (BufferLocks[i] & BL_RI_LOCK) - { - - /* - * Someone else could remove our RI lock when acquiring W - * lock. This is possible if we came here from elog(ERROR) - * from IpcSemaphore{Lock|Unlock}(WaitCLSemId). And so we - * don't do Assert(buf->ri_lock) here. - */ - buf->ri_lock = false; - } - if (BufferLocks[i] & BL_W_LOCK) - { - Assert(buf->w_lock); - buf->w_lock = false; - } - - S_UNLOCK(&(buf->cntx_lock)); - - BufferLocks[i] = 0; - } -} - -void -LockBuffer(Buffer buffer, int mode) -{ - BufferDesc *buf; - bits8 *buflock; - - Assert(BufferIsValid(buffer)); - if (BufferIsLocal(buffer)) - return; - - buf = &(BufferDescriptors[buffer - 1]); - buflock = &(BufferLocks[buffer - 1]); - - S_LOCK(&(buf->cntx_lock)); - - if (mode == BUFFER_LOCK_UNLOCK) - { - if (*buflock & BL_R_LOCK) - { - Assert(buf->r_locks > 0); - Assert(!(buf->w_lock)); - Assert(!(*buflock & (BL_W_LOCK | BL_RI_LOCK))); - (buf->r_locks)--; - *buflock &= ~BL_R_LOCK; - } - else if (*buflock & BL_W_LOCK) - { - Assert(buf->w_lock); - Assert(buf->r_locks == 0); - Assert(!(*buflock & (BL_R_LOCK | BL_RI_LOCK))); - buf->w_lock = false; - *buflock &= ~BL_W_LOCK; - } - else - elog(ERROR, "UNLockBuffer: buffer %lu is not locked", buffer); - } - else if (mode == BUFFER_LOCK_SHARE) - { - unsigned i = 0; - - Assert(!(*buflock & (BL_R_LOCK | BL_W_LOCK | BL_RI_LOCK))); - while (buf->ri_lock || buf->w_lock) - { - S_UNLOCK(&(buf->cntx_lock)); - s_lock_sleep(i++); - S_LOCK(&(buf->cntx_lock)); - } - (buf->r_locks)++; - *buflock |= BL_R_LOCK; - } - else if (mode == BUFFER_LOCK_EXCLUSIVE) - { - unsigned i = 0; - - Assert(!(*buflock & (BL_R_LOCK | BL_W_LOCK | BL_RI_LOCK))); - while (buf->r_locks > 0 || buf->w_lock) - { - if (buf->r_locks > 3 || (*buflock & BL_RI_LOCK)) - { - - /* - * Our RI lock might be removed by concurrent W lock - * acquiring (see what we do with RI locks below when our - * own W acquiring succeeded) and so we set RI lock again - * if we already did this. - */ - *buflock |= BL_RI_LOCK; - buf->ri_lock = true; - } - S_UNLOCK(&(buf->cntx_lock)); - s_lock_sleep(i++); - S_LOCK(&(buf->cntx_lock)); - } - buf->w_lock = true; - *buflock |= BL_W_LOCK; - - buf->cntxDirty = true; - - if (*buflock & BL_RI_LOCK) - { - - /* - * It's possible to remove RI locks acquired by another W - * lockers here, but they'll take care about it. - */ - buf->ri_lock = false; - *buflock &= ~BL_RI_LOCK; - } - } - else - elog(ERROR, "LockBuffer: unknown lock mode %d", mode); - - S_UNLOCK(&(buf->cntx_lock)); -} - -/* - * Functions for IO error handling - * - * Note : We assume that nested buffer IO never occur. - * i.e at most one io_in_progress spinlock is held - * per proc. -*/ -static BufferDesc *InProgressBuf = (BufferDesc *) NULL; -static bool IsForInput; - -/* - * Function:StartBufferIO - * (Assumptions) - * My process is executing no IO - * BufMgrLock is held - * BM_IO_IN_PROGRESS mask is not set for the buffer - * The buffer is Pinned - * -*/ -static void -StartBufferIO(BufferDesc *buf, bool forInput) -{ - Assert(!InProgressBuf); - Assert(!(buf->flags & BM_IO_IN_PROGRESS)); - buf->flags |= BM_IO_IN_PROGRESS; - - /* - * There used to be - * - * Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); - * - * here, but that's wrong because of the way WaitIO works: someone else - * waiting for the I/O to complete will succeed in grabbing the lock - * for a few instructions, and if we context-swap back to here the - * Assert could fail. Tiny window for failure, but I've seen it - * happen -- tgl - */ - S_LOCK(&(buf->io_in_progress_lock)); - - InProgressBuf = buf; - IsForInput = forInput; -} - -/* - * Function:TerminateBufferIO - * (Assumptions) - * My process is executing IO for the buffer - * BufMgrLock is held - * The buffer is Pinned - * -*/ -static void -TerminateBufferIO(BufferDesc *buf) -{ - Assert(buf == InProgressBuf); - S_UNLOCK(&(buf->io_in_progress_lock)); - InProgressBuf = (BufferDesc *) 0; -} - -/* - * Function:ContinueBufferIO - * (Assumptions) - * My process is executing IO for the buffer - * BufMgrLock is held - * The buffer is Pinned - * -*/ -static void -ContinueBufferIO(BufferDesc *buf, bool forInput) -{ - Assert(buf == InProgressBuf); - Assert(buf->flags & BM_IO_IN_PROGRESS); - IsForInput = forInput; -} - -#ifdef NOT_USED -void -InitBufferIO(void) -{ - InProgressBuf = (BufferDesc *) 0; -} -#endif - -/* - * This function is called from ProcReleaseSpins(). - * BufMgrLock isn't held when this function is called. - * BM_IO_ERROR is always set. If BM_IO_ERROR was already - * set in case of output,this routine would kill all - * backends and reset postmaster. - */ -void -AbortBufferIO(void) -{ - BufferDesc *buf = InProgressBuf; - - if (buf) - { - Assert(buf->flags & BM_IO_IN_PROGRESS); - SpinAcquire(BufMgrLock); - if (IsForInput) - Assert(!(buf->flags & BM_DIRTY) && !(buf->cntxDirty)); - else - { - Assert(buf->flags & BM_DIRTY || buf->cntxDirty); - if (buf->flags & BM_IO_ERROR) - { - elog(NOTICE, "write error may be permanent: cannot write block %u for %s/%s", - buf->tag.blockNum, buf->blind.dbname, buf->blind.relname); - } - buf->flags |= BM_DIRTY; - } - buf->flags |= BM_IO_ERROR; - buf->flags &= ~BM_IO_IN_PROGRESS; - TerminateBufferIO(buf); - SpinRelease(BufMgrLock); - } -} - -/* - * Cleanup buffer or mark it for cleanup. Buffer may be cleaned - * up if it's pinned only once. - * - * NOTE: buffer must be excl locked. - */ -void -MarkBufferForCleanup(Buffer buffer, void (*CleanupFunc)(Buffer)) -{ - BufferDesc *bufHdr = &BufferDescriptors[buffer - 1]; - - Assert(PrivateRefCount[buffer - 1] > 0); - - if (PrivateRefCount[buffer - 1] > 1) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - PrivateRefCount[buffer - 1]--; - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->CleanupFunc = CleanupFunc; - SpinRelease(BufMgrLock); - return; - } - - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - if (bufHdr->refcount == 1) - { - SpinRelease(BufMgrLock); - CleanupFunc(buffer); - CleanupFunc = NULL; - } - else - SpinRelease(BufMgrLock); - - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - PrivateRefCount[buffer - 1]--; - - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->CleanupFunc = CleanupFunc; - bufHdr->refcount--; - if (bufHdr->refcount == 0) - { - AddBufferToFreelist(bufHdr); - bufHdr->flags |= BM_FREE; - } - SpinRelease(BufMgrLock); - return; -} diff --git a/src/backend/storage/buffer/xlog_localbuf.c b/src/backend/storage/buffer/xlog_localbuf.c deleted file mode 100644 index dda7456e72c..00000000000 --- a/src/backend/storage/buffer/xlog_localbuf.c +++ /dev/null @@ -1,284 +0,0 @@ -/*------------------------------------------------------------------------- - * - * xlog_localbuf.c - * local buffer manager. Fast buffer manager for temporary tables - * or special cases when the operation is not visible to other backends. - * - * When a relation is being created, the descriptor will have rd_islocal - * set to indicate that the local buffer manager should be used. During - * the same transaction the relation is being created, any inserts or - * selects from the newly created relation will use the local buffer - * pool. rd_islocal is reset at the end of a transaction (commit/abort). - * This is useful for queries like SELECT INTO TABLE and create index. - * - * Portions Copyright (c) 1996-2000, PostgreSQL, Inc - * Portions Copyright (c) 1994-5, Regents of the University of California - * - * - * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/xlog_localbuf.c,v 1.2 2000/11/30 01:39:07 tgl Exp $ - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include <sys/types.h> -#include <sys/file.h> -#include <math.h> -#include <signal.h> - -#include "executor/execdebug.h" -#include "storage/buf_internals.h" -#include "storage/bufmgr.h" -#include "storage/smgr.h" -#include "utils/relcache.h" - -extern long int LocalBufferFlushCount; - -int NLocBuffer = 64; -BufferDesc *LocalBufferDescriptors = NULL; -Block *LocalBufferBlockPointers = NULL; -long *LocalRefCount = NULL; - -static int nextFreeLocalBuf = 0; - -/*#define LBDEBUG*/ - -/* - * LocalBufferAlloc - - * allocate a local buffer. We do round robin allocation for now. - */ -BufferDesc * -LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) -{ - int i; - BufferDesc *bufHdr = (BufferDesc *) NULL; - - if (blockNum == P_NEW) - { - blockNum = reln->rd_nblocks; - reln->rd_nblocks++; - } - - /* a low tech search for now -- not optimized for scans */ - for (i = 0; i < NLocBuffer; i++) - { - if (LocalBufferDescriptors[i].tag.rnode.relNode == - reln->rd_node.relNode && - LocalBufferDescriptors[i].tag.blockNum == blockNum) - { - -#ifdef LBDEBUG - fprintf(stderr, "LB ALLOC (%u,%d) %d\n", - RelationGetRelid(reln), blockNum, -i - 1); -#endif - LocalRefCount[i]++; - *foundPtr = TRUE; - return &LocalBufferDescriptors[i]; - } - } - -#ifdef LBDEBUG - fprintf(stderr, "LB ALLOC (%u,%d) %d\n", - RelationGetRelid(reln), blockNum, -nextFreeLocalBuf - 1); -#endif - - /* need to get a new buffer (round robin for now) */ - for (i = 0; i < NLocBuffer; i++) - { - int b = (nextFreeLocalBuf + i) % NLocBuffer; - - if (LocalRefCount[b] == 0) - { - bufHdr = &LocalBufferDescriptors[b]; - LocalRefCount[b]++; - nextFreeLocalBuf = (b + 1) % NLocBuffer; - break; - } - } - if (bufHdr == NULL) - elog(ERROR, "no empty local buffer."); - - /* - * this buffer is not referenced but it might still be dirty (the last - * transaction to touch it doesn't need its contents but has not - * flushed it). if that's the case, write it out before reusing it! - */ - if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) - { - Relation bufrel = RelationNodeCacheGetRelation(bufHdr->tag.rnode); - - Assert(bufrel != NULL); - - /* flush this page */ - smgrwrite(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - LocalBufferFlushCount++; - - /* - * drop relcache refcount incremented by - * RelationIdCacheGetRelation - */ - RelationDecrementReferenceCount(bufrel); - } - - /* - * it's all ours now. - * - * We need not in tblNode currently but will in future I think, - * when we'll give up rel->rd_fd to fmgr cache. - */ - bufHdr->tag.rnode = reln->rd_node; - bufHdr->tag.blockNum = blockNum; - bufHdr->flags &= ~BM_DIRTY; - bufHdr->cntxDirty = false; - - /* - * lazy memory allocation: allocate space on first use of a buffer. - */ - if (bufHdr->data == (SHMEM_OFFSET) 0) - { - char *data = (char *) malloc(BLCKSZ); - - if (data == NULL) - elog(FATAL, "Out of memory in LocalBufferAlloc"); - /* - * This is a bit of a hack: bufHdr->data needs to be a shmem offset - * for consistency with the shared-buffer case, so make it one - * even though it's not really a valid shmem offset. - */ - bufHdr->data = MAKE_OFFSET(data); - /* - * Set pointer for use by BufferGetBlock() macro. - */ - LocalBufferBlockPointers[-(bufHdr->buf_id + 2)] = (Block) data; - } - - *foundPtr = FALSE; - return bufHdr; -} - -/* - * WriteLocalBuffer - - * writes out a local buffer - */ -int -WriteLocalBuffer(Buffer buffer, bool release) -{ - int bufid; - - Assert(BufferIsLocal(buffer)); - -#ifdef LBDEBUG - fprintf(stderr, "LB WRITE %d\n", buffer); -#endif - - bufid = -(buffer + 1); - LocalBufferDescriptors[bufid].flags |= BM_DIRTY; - - if (release) - { - Assert(LocalRefCount[bufid] > 0); - LocalRefCount[bufid]--; - } - - return true; -} - -/* - * InitLocalBuffer - - * init the local buffer cache. Since most queries (esp. multi-user ones) - * don't involve local buffers, we delay allocating actual memory for the - * buffer until we need it. - */ -void -InitLocalBuffer(void) -{ - int i; - - /* - * these aren't going away. I'm not gonna use palloc. - */ - LocalBufferDescriptors = - (BufferDesc *) calloc(NLocBuffer, sizeof(BufferDesc)); - LocalBufferBlockPointers = (Block *) calloc(NLocBuffer, sizeof(Block)); - LocalRefCount = (long *) calloc(NLocBuffer, sizeof(long)); - nextFreeLocalBuf = 0; - - for (i = 0; i < NLocBuffer; i++) - { - BufferDesc *buf = &LocalBufferDescriptors[i]; - - /* - * negative to indicate local buffer. This is tricky: shared - * buffers start with 0. We have to start with -2. (Note that the - * routine BufferDescriptorGetBuffer adds 1 to buf_id so our first - * buffer id is -1.) - */ - buf->buf_id = -i - 2; - } -} - -/* - * LocalBufferSync - * - * Flush all dirty buffers in the local buffer cache at commit time. - * Since the buffer cache is only used for keeping relations visible - * during a transaction, we will not need these buffers again. - * - * Note that we have to *flush* local buffers because of them are not - * visible to checkpoint makers. But we can skip XLOG flush check. - */ -void -LocalBufferSync(void) -{ - int i; - - for (i = 0; i < NLocBuffer; i++) - { - BufferDesc *buf = &LocalBufferDescriptors[i]; - Relation bufrel; - - if (buf->flags & BM_DIRTY || buf->cntxDirty) - { -#ifdef LBDEBUG - fprintf(stderr, "LB SYNC %d\n", -i - 1); -#endif - bufrel = RelationNodeCacheGetRelation(buf->tag.rnode); - - Assert(bufrel != NULL); - - smgrwrite(DEFAULT_SMGR, bufrel, buf->tag.blockNum, - (char *) MAKE_PTR(buf->data)); - smgrmarkdirty(DEFAULT_SMGR, bufrel, buf->tag.blockNum); - LocalBufferFlushCount++; - - /* drop relcache refcount from RelationIdCacheGetRelation */ - RelationDecrementReferenceCount(bufrel); - - buf->flags &= ~BM_DIRTY; - buf->cntxDirty = false; - } - } - - MemSet(LocalRefCount, 0, sizeof(long) * NLocBuffer); - nextFreeLocalBuf = 0; -} - -void -ResetLocalBufferPool(void) -{ - int i; - - for (i = 0; i < NLocBuffer; i++) - { - BufferDesc *buf = &LocalBufferDescriptors[i]; - - buf->tag.rnode.relNode = InvalidOid; - buf->flags &= ~BM_DIRTY; - buf->cntxDirty = false; - } - - MemSet(LocalRefCount, 0, sizeof(long) * NLocBuffer); - nextFreeLocalBuf = 0; -} |