path: root/src/backend/commands
author     Alvaro Herrera <alvherre@alvh.no-ip.org>  2013-12-16 11:29:50 -0300
committer  Alvaro Herrera <alvherre@alvh.no-ip.org>  2013-12-16 11:29:50 -0300
commit     3b97e6823b949624afdc3ce4c92b29a80429715f (patch)
tree       a17cfce57aa3d963b0f7ab09c4b2649ed0a9eb50 /src/backend/commands
parent     30b96549ab41ce23399256d4ea9723a05c139558 (diff)
Rework tuple freezing protocol
Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions.

The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record.

While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable.

In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while.

In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there.

Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched.

Álvaro Herrera and Andres Freund
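For orientation, here is a minimal sketch of the prepare/execute split the message describes, with the WAL record carrying the fully computed replacement values. The struct layout and signatures follow the patch as applied to the 9.3 branch; treat the details as illustrative rather than authoritative:

    /* One entry per tuple frozen on a page; an array of these is
     * written verbatim into the new freeze WAL record. */
    typedef struct xl_heap_freeze_tuple
    {
        TransactionId xmax;        /* replacement xmax, already computed */
        OffsetNumber offset;       /* tuple's line pointer on the page */
        uint16       t_infomask2;  /* replacement infomask2 bits */
        uint16       t_infomask;   /* replacement infomask bits */
        uint8        frzflags;     /* freeze-specific flags (xvac handling) */
    } xl_heap_freeze_tuple;

    /* Computation half: decide whether the tuple needs freezing and, if
     * so, fill *frz with the values to install.  This may create a new
     * MultiXactId and consult CLOG, so it can only run during normal
     * operation, never during WAL replay. */
    extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple,
                                          TransactionId cutoff_xid,
                                          MultiXactId cutoff_multi,
                                          xl_heap_freeze_tuple *frz);

    /* Execution half: install the precomputed values.  It performs no
     * lookups at all, so it is safe inside a critical section and during
     * WAL replay alike. */
    extern void heap_execute_freeze_tuple(HeapTupleHeader tuple,
                                          xl_heap_freeze_tuple *frz);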
Diffstat (limited to 'src/backend/commands')
-rw-r--r--  src/backend/commands/vacuumlazy.c | 31
1 file changed, 26 insertions, 5 deletions
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 28e98e8b481..8dd3de5e8e2 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -424,6 +424,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
Buffer vmbuffer = InvalidBuffer;
BlockNumber next_not_all_visible_block;
bool skipping_all_visible_blocks;
+ xl_heap_freeze_tuple *frozen;
pg_rusage_init(&ru0);
@@ -446,6 +447,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
vacrelstats->latestRemovedXid = InvalidTransactionId;
lazy_space_alloc(vacrelstats, nblocks);
+ frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
/*
* We want to skip pages that don't require vacuuming according to the
@@ -500,7 +502,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
bool tupgone,
hastup;
int prev_dead_count;
- OffsetNumber frozen[MaxOffsetNumber];
int nfrozen;
Size freespace;
bool all_visible_according_to_vm;
@@ -890,9 +891,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
* Each non-removable tuple must be checked to see if it needs
* freezing. Note we already have exclusive buffer lock.
*/
- if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
- MultiXactCutoff))
- frozen[nfrozen++] = offnum;
+ if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit,
+ MultiXactCutoff, &frozen[nfrozen]))
+ frozen[nfrozen++].offset = offnum;
}
} /* scan along page */
@@ -903,15 +904,33 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
*/
if (nfrozen > 0)
{
+ START_CRIT_SECTION();
+
MarkBufferDirty(buf);
+
+ /* execute collected freezes */
+ for (i = 0; i < nfrozen; i++)
+ {
+ ItemId itemid;
+ HeapTupleHeader htup;
+
+ itemid = PageGetItemId(page, frozen[i].offset);
+ htup = (HeapTupleHeader) PageGetItem(page, itemid);
+
+ heap_execute_freeze_tuple(htup, &frozen[i]);
+ }
+
+ /* Now WAL-log freezing if necessary */
if (RelationNeedsWAL(onerel))
{
XLogRecPtr recptr;
recptr = log_heap_freeze(onerel, buf, FreezeLimit,
- MultiXactCutoff, frozen, nfrozen);
+ frozen, nfrozen);
PageSetLSN(page, recptr);
}
+
+ END_CRIT_SECTION();
}
/*
@@ -1012,6 +1031,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
RecordPageWithFreeSpace(onerel, blkno, freespace);
}
+ pfree(frozen);
+
/* save stats for use later */
vacrelstats->scanned_tuples = num_tuples;
vacrelstats->tuples_deleted = tups_vacuumed;
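The replay side of the new record then reduces to re-applying the recorded values. Below is a hypothetical simplification of the redo loop (the real redo routine also deals with buffer pinning, locking, and LSN updates), illustrating why replay needs no CLOG or multixact access:

    static void
    freeze_page_redo_sketch(Page page, xl_heap_freeze_tuple *tuples, int ntup)
    {
        int     i;

        /* The record already carries the final xmax/infomask values, so
         * replay just re-installs them via the same execution half used
         * at vacuum time. */
        for (i = 0; i < ntup; i++)
        {
            xl_heap_freeze_tuple *frz = &tuples[i];
            ItemId          itemid = PageGetItemId(page, frz->offset);
            HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid);

            heap_execute_freeze_tuple(htup, frz);
        }
    }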