aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/storage/buffer/bufmgr.c136
-rw-r--r--src/backend/storage/smgr/smgr.c30
-rw-r--r--src/include/storage/bufmgr.h2
-rw-r--r--src/include/storage/smgr.h1
4 files changed, 154 insertions, 15 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 71b5852224f..c192c2e35b5 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -70,6 +70,14 @@
#define RELS_BSEARCH_THRESHOLD 20
+/*
+ * This is the size (in the number of blocks) above which we scan the
+ * entire buffer pool to remove the buffers for all the pages of the
+ * relation being dropped. For relations with size below this threshold,
+ * we find the buffers by doing lookups in the BufMapping table.
+ */
+#define BUF_DROP_FULL_SCAN_THRESHOLD (uint32) (NBuffers / 32)
+
typedef struct PrivateRefCountEntry
{
Buffer buffer;
@@ -473,6 +481,10 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
BufferAccessStrategy strategy,
bool *foundPtr);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
+static void FindAndDropRelFileNodeBuffers(RelFileNode rnode,
+ ForkNumber forkNum,
+ BlockNumber nForkBlock,
+ BlockNumber firstDelBlock);
static void AtProcExit_Buffers(int code, Datum arg);
static void CheckForBufferLeaks(void);
static int rnode_comparator(const void *p1, const void *p2);
@@ -2965,19 +2977,19 @@ BufferGetLSNAtomic(Buffer buffer)
* later. It is also the responsibility of higher-level code to ensure
* that no other process could be trying to load more pages of the
* relation into buffers.
- *
- * XXX currently it sequentially searches the buffer pool, should be
- * changed to more clever ways of searching. However, this routine
- * is used only in code paths that aren't very performance-critical,
- * and we shouldn't slow down the hot paths to make it faster ...
* --------------------------------------------------------------------
*/
void
-DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+DropRelFileNodeBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
int nforks, BlockNumber *firstDelBlock)
{
int i;
int j;
+ RelFileNodeBackend rnode;
+ BlockNumber nForkBlock[MAX_FORKNUM];
+ BlockNumber nBlocksToInvalidate = 0;
+
+ rnode = smgr_reln->smgr_rnode;
/* If it's a local relation, it's localbuf.c's problem. */
if (RelFileNodeBackendIsTemp(rnode))
@@ -2991,6 +3003,56 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
return;
}
+ /*
+ * To remove all the pages of the specified relation forks from the buffer
+ * pool, we need to scan the entire buffer pool but we can optimize it by
+ * finding the buffers from BufMapping table provided we know the exact
+ * size of each fork of the relation. The exact size is required to ensure
+ * that we don't leave any buffer for the relation being dropped as
+ * otherwise the background writer or checkpointer can lead to a PANIC
+ * error while flushing buffers corresponding to files that don't exist.
+ *
+ * To know the exact size, we rely on the size cached for each fork by us
+ * during recovery which limits the optimization to recovery and on
+ * standbys but we can easily extend it once we have shared cache for
+ * relation size.
+ *
+ * In recovery, we cache the value returned by the first lseek(SEEK_END)
+ * and future writes keep the cached value up-to-date. See
+ * smgrextend. It is possible that the value of the first lseek is smaller
+ * than the actual number of existing blocks in the file due to buggy
+ * Linux kernels that might not have accounted for the recent write. But
+ * that should be fine because there must not be any buffers after that
+ * file size.
+ */
+ for (i = 0; i < nforks; i++)
+ {
+ /* Get the number of blocks for a relation's fork */
+ nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
+
+ if (nForkBlock[i] == InvalidBlockNumber)
+ {
+ nBlocksToInvalidate = InvalidBlockNumber;
+ break;
+ }
+
+ /* calculate the number of blocks to be invalidated */
+ nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
+ }
+
+ /*
+ * We apply the optimization iff the total number of blocks to invalidate
+ * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
+ */
+ if (BlockNumberIsValid(nBlocksToInvalidate) &&
+ nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
+ {
+ for (j = 0; j < nforks; j++)
+ FindAndDropRelFileNodeBuffers(rnode.node, forkNum[j],
+ nForkBlock[j], firstDelBlock[j]);
+ return;
+ }
+
for (i = 0; i < NBuffers; i++)
{
BufferDesc *bufHdr = GetBufferDescriptor(i);
@@ -3134,6 +3196,65 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
}
/* ---------------------------------------------------------------------
+ * FindAndDropRelFileNodeBuffers
+ *
+ * This function performs a lookup in the BufMapping table and removes
+ * from the buffer pool all the pages of the specified relation fork that
+ * have block numbers >= firstDelBlock. (In particular, with firstDelBlock
+ * = 0, all pages are removed.)
+ * --------------------------------------------------------------------
+ */
+static void
+FindAndDropRelFileNodeBuffers(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber nForkBlock,
+ BlockNumber firstDelBlock)
+{
+ BlockNumber curBlock;
+
+ for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
+ {
+ uint32 bufHash; /* hash value for tag */
+ BufferTag bufTag; /* identity of requested block */
+ LWLock *bufPartitionLock; /* buffer partition lock for it */
+ int buf_id;
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+
+ /* create a tag so we can lookup the buffer */
+ INIT_BUFFERTAG(bufTag, rnode, forkNum, curBlock);
+
+ /* determine its hash code and partition lock ID */
+ bufHash = BufTableHashCode(&bufTag);
+ bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+ /* Check that it is in the buffer pool. If not, do nothing. */
+ LWLockAcquire(bufPartitionLock, LW_SHARED);
+ buf_id = BufTableLookup(&bufTag, bufHash);
+ LWLockRelease(bufPartitionLock);
+
+ if (buf_id < 0)
+ continue;
+
+ bufHdr = GetBufferDescriptor(buf_id);
+
+ /*
+ * We need to lock the buffer header and recheck if the buffer is
+ * still associated with the same block because the buffer could be
+ * evicted by some other backend loading blocks for a different
+ * relation after we release lock on the BufMapping table.
+ */
+ buf_state = LockBufHdr(bufHdr);
+
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+ bufHdr->tag.forkNum == forkNum &&
+ bufHdr->tag.blockNum >= firstDelBlock)
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
+
+/* ---------------------------------------------------------------------
* DropDatabaseBuffers
*
* This function removes all the buffers in the buffer cache for a
@@ -3245,8 +3366,7 @@ PrintPinnedBufs(void)
* XXX currently it sequentially searches the buffer pool, should be
* changed to more clever ways of searching. This routine is not
* used in any performance-critical code paths, so it's not worth
- * adding additional overhead to normal paths to make it go faster;
- * but see also DropRelFileNodeBuffers.
+ * adding additional overhead to normal paths to make it go faster.
* --------------------------------------------------------------------
*/
void
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 0f31ff38221..af603c3db3b 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -549,6 +549,28 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum)
{
BlockNumber result;
+ /* Check and return if we get the cached value for the number of blocks. */
+ result = smgrnblocks_cached(reln, forknum);
+ if (result != InvalidBlockNumber)
+ return result;
+
+ result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
+
+ reln->smgr_cached_nblocks[forknum] = result;
+
+ return result;
+}
+
+/*
+ * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
+ * relation.
+ *
+ * Returns InvalidBlockNumber when not in recovery, or when the relation
+ * fork size is not cached.
+ */
+BlockNumber
+smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
+{
/*
* For now, we only use cached values in recovery due to lack of a shared
* invalidation mechanism for changes in file size.
@@ -556,11 +578,7 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum)
if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
return reln->smgr_cached_nblocks[forknum];
- result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
-
- reln->smgr_cached_nblocks[forknum] = result;
-
- return result;
+ return InvalidBlockNumber;
}
/*
@@ -582,7 +600,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
* Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
* just drop them without bothering to write the contents.
*/
- DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nforks, nblocks);
+ DropRelFileNodeBuffers(reln, forknum, nforks, nblocks);
/*
* Send a shared-inval message to force other backends to close any smgr
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ff6cd0fc54e..0c484f3addb 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -203,7 +203,7 @@ extern void FlushOneBuffer(Buffer buffer);
extern void FlushRelationBuffers(Relation rel);
extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels);
extern void FlushDatabaseBuffers(Oid dbid);
-extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+extern void DropRelFileNodeBuffers(struct SMgrRelationData *smgr_reln, ForkNumber *forkNum,
int nforks, BlockNumber *firstDelBlock);
extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
extern void DropDatabaseBuffers(Oid dbid);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index ebf4a199dcb..a6fbf7b6a6c 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -99,6 +99,7 @@ extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
+extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum);
extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
int nforks, BlockNumber *nblocks);
extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);