aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--contrib/pg_visibility/pg_visibility.c29
-rw-r--r--src/backend/catalog/storage.c44
-rw-r--r--src/backend/storage/smgr/md.c28
-rw-r--r--src/backend/storage/smgr/smgr.c14
-rw-r--r--src/include/storage/md.h2
-rw-r--r--src/include/storage/smgr.h5
6 files changed, 89 insertions, 33 deletions
diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c
index 5d0deaba61e..f97373e52e9 100644
--- a/contrib/pg_visibility/pg_visibility.c
+++ b/contrib/pg_visibility/pg_visibility.c
@@ -19,6 +19,7 @@
#include "funcapi.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
+#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/read_stream.h"
#include "storage/smgr.h"
@@ -390,6 +391,7 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS)
Relation rel;
ForkNumber fork;
BlockNumber block;
+ BlockNumber old_block;
rel = relation_open(relid, AccessExclusiveLock);
@@ -399,15 +401,22 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS)
/* Forcibly reset cached file size */
RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
+ /* Compute new and old size before entering critical section. */
+ fork = VISIBILITYMAP_FORKNUM;
block = visibilitymap_prepare_truncate(rel, 0);
- if (BlockNumberIsValid(block))
- {
- fork = VISIBILITYMAP_FORKNUM;
- smgrtruncate(RelationGetSmgr(rel), &fork, 1, &block);
- }
+ old_block = BlockNumberIsValid(block) ? smgrnblocks(RelationGetSmgr(rel), fork) : 0;
+
+ /*
+ * WAL-logging, buffer dropping, file truncation must be atomic and all on
+ * one side of a checkpoint. See RelationTruncate() for discussion.
+ */
+ Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
+ START_CRIT_SECTION();
if (RelationNeedsWAL(rel))
{
+ XLogRecPtr lsn;
xl_smgr_truncate xlrec;
xlrec.blkno = 0;
@@ -417,9 +426,17 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS)
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, sizeof(xlrec));
- XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+ lsn = XLogInsert(RM_SMGR_ID,
+ XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+ XLogFlush(lsn);
}
+ if (BlockNumberIsValid(block))
+ smgrtruncate(RelationGetSmgr(rel), &fork, 1, &old_block, &block);
+
+ END_CRIT_SECTION();
+ MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
+
/*
* Release the lock right away, not at commit time.
*
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index bb8c4d15612..5b22cf10990 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -291,6 +291,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
bool vm;
bool need_fsm_vacuum = false;
ForkNumber forks[MAX_FORKNUM];
+ BlockNumber old_blocks[MAX_FORKNUM];
BlockNumber blocks[MAX_FORKNUM];
int nforks = 0;
SMgrRelation reln;
@@ -306,6 +307,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
/* Prepare for truncation of MAIN fork of the relation */
forks[nforks] = MAIN_FORKNUM;
+ old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
blocks[nforks] = nblocks;
nforks++;
@@ -317,6 +319,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
if (BlockNumberIsValid(blocks[nforks]))
{
forks[nforks] = FSM_FORKNUM;
+ old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
nforks++;
need_fsm_vacuum = true;
}
@@ -330,6 +333,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
if (BlockNumberIsValid(blocks[nforks]))
{
forks[nforks] = VISIBILITYMAP_FORKNUM;
+ old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
nforks++;
}
}
@@ -366,14 +370,20 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
/*
- * We WAL-log the truncation before actually truncating, which means
- * trouble if the truncation fails. If we then crash, the WAL replay
- * likely isn't going to succeed in the truncation either, and cause a
- * PANIC. It's tempting to put a critical section here, but that cure
- * would be worse than the disease. It would turn a usually harmless
- * failure to truncate, that might spell trouble at WAL replay, into a
- * certain PANIC.
+ * We WAL-log the truncation first and then truncate in a critical
+ * section. Truncation drops buffers, even if dirty, and then truncates
+ * disk files. All of that work needs to complete before the lock is
+ * released, or else old versions of pages on disk that are missing recent
+ * changes would become accessible again. We'll try the whole operation
+ * again in crash recovery if we panic, but even then we can't give up
+ * because we don't want standbys' relation sizes to diverge and break
+ * replay or visibility invariants downstream. The critical section also
+ * suppresses interrupts.
+ *
+ * (See also pg_visibilitymap.c if changing this code.)
*/
+ START_CRIT_SECTION();
+
if (RelationNeedsWAL(rel))
{
/*
@@ -397,10 +407,10 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
* hit the disk before the WAL record, and the truncation of the FSM
* or visibility map. If we crashed during that window, we'd be left
* with a truncated heap, but the FSM or visibility map would still
- * contain entries for the non-existent heap pages.
+ * contain entries for the non-existent heap pages, and standbys would
+ * also never replay the truncation.
*/
- if (fsm || vm)
- XLogFlush(lsn);
+ XLogFlush(lsn);
}
/*
@@ -408,7 +418,9 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
* longer exist after truncation is complete, and then truncate the
* corresponding files on disk.
*/
- smgrtruncate(RelationGetSmgr(rel), forks, nforks, blocks);
+ smgrtruncate(RelationGetSmgr(rel), forks, nforks, old_blocks, blocks);
+
+ END_CRIT_SECTION();
/* We've done all the critical work, so checkpoints are OK now. */
MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
@@ -973,6 +985,7 @@ smgr_redo(XLogReaderState *record)
Relation rel;
ForkNumber forks[MAX_FORKNUM];
BlockNumber blocks[MAX_FORKNUM];
+ BlockNumber old_blocks[MAX_FORKNUM];
int nforks = 0;
bool need_fsm_vacuum = false;
@@ -1007,6 +1020,7 @@ smgr_redo(XLogReaderState *record)
if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
{
forks[nforks] = MAIN_FORKNUM;
+ old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
blocks[nforks] = xlrec->blkno;
nforks++;
@@ -1024,6 +1038,7 @@ smgr_redo(XLogReaderState *record)
if (BlockNumberIsValid(blocks[nforks]))
{
forks[nforks] = FSM_FORKNUM;
+ old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
nforks++;
need_fsm_vacuum = true;
}
@@ -1035,13 +1050,18 @@ smgr_redo(XLogReaderState *record)
if (BlockNumberIsValid(blocks[nforks]))
{
forks[nforks] = VISIBILITYMAP_FORKNUM;
+ old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
nforks++;
}
}
/* Do the real work to truncate relation forks */
if (nforks > 0)
- smgrtruncate(reln, forks, nforks, blocks);
+ {
+ START_CRIT_SECTION();
+ smgrtruncate(reln, forks, nforks, old_blocks, blocks);
+ END_CRIT_SECTION();
+ }
/*
* Update upper-level FSM pages to account for the truncation. This is
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 4a897a83593..11fccda475f 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1154,19 +1154,21 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
/*
* mdtruncate() -- Truncate relation to specified number of blocks.
+ *
+ * Guaranteed not to allocate memory, so it can be used in a critical section.
+ * Caller must have called smgrnblocks() to obtain curnblk while holding a
+ * sufficient lock to prevent a change in relation size, and not used any smgr
+ * functions for this relation or handled interrupts in between. This makes
+ * sure we have opened all active segments, so that truncate loop will get
+ * them all!
*/
void
-mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
+mdtruncate(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber curnblk, BlockNumber nblocks)
{
- BlockNumber curnblk;
BlockNumber priorblocks;
int curopensegs;
- /*
- * NOTE: mdnblocks makes sure we have opened all active segments, so that
- * truncation loop will get them all!
- */
- curnblk = mdnblocks(reln, forknum);
if (nblocks > curnblk)
{
/* Bogus request ... but no complaint if InRecovery */
@@ -1505,7 +1507,7 @@ _fdvec_resize(SMgrRelation reln,
reln->md_seg_fds[forknum] =
MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
}
- else
+ else if (nseg > reln->md_num_open_segs[forknum])
{
/*
* It doesn't seem worthwhile complicating the code to amortize
@@ -1517,6 +1519,16 @@ _fdvec_resize(SMgrRelation reln,
repalloc(reln->md_seg_fds[forknum],
sizeof(MdfdVec) * nseg);
}
+ else
+ {
+ /*
+ * We don't reallocate a smaller array, because we want mdtruncate()
+ * to be able to promise that it won't allocate memory, so that it is
+ * allowed in a critical section. This means that a bit of space in
+ * the array is now wasted, until the next time we add a segment and
+ * reallocate.
+ */
+ }
reln->md_num_open_segs[forknum] = nseg;
}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 925728eb6c1..36ad34aa6ac 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -101,7 +101,7 @@ typedef struct f_smgr
BlockNumber blocknum, BlockNumber nblocks);
BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
- BlockNumber nblocks);
+ BlockNumber old_blocks, BlockNumber nblocks);
void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum);
} f_smgr;
@@ -719,10 +719,15 @@ smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
*
* The caller must hold AccessExclusiveLock on the relation, to ensure that
* other backends receive the smgr invalidation event that this function sends
- * before they access any forks of the relation again.
+ * before they access any forks of the relation again. The current size of
+ * the forks should be provided in old_nblocks. This function should normally
+ * be called in a critical section, but the current size must be checked
+ * outside the critical section, and no interrupts or smgr functions relating
+ * to this relation should be called in between.
*/
void
-smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks)
+smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
+ BlockNumber *old_nblocks, BlockNumber *nblocks)
{
int i;
@@ -750,7 +755,8 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
/* Make the cached size is invalid if we encounter an error. */
reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
- smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]);
+ smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i],
+ old_nblocks[i], nblocks[i]);
/*
* We might as well update the local smgr_cached_nblocks values. The
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index b72293c79a5..e7671dd6c18 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -43,7 +43,7 @@ extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
- BlockNumber nblocks);
+ BlockNumber old_blocks, BlockNumber nblocks);
extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
extern void mdregistersync(SMgrRelation reln, ForkNumber forknum);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 5ab992f5bd5..63a186bd346 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -105,8 +105,9 @@ extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum);
-extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
- int nforks, BlockNumber *nblocks);
+extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
+ BlockNumber *old_nblocks,
+ BlockNumber *nblocks);
extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
extern void smgrregistersync(SMgrRelation reln, ForkNumber forknum);
extern void AtEOXact_SMgr(void);