diff options
author | Robert Haas <rhaas@postgresql.org> | 2024-07-26 14:50:21 -0400 |
---|---|---|
committer | Robert Haas <rhaas@postgresql.org> | 2024-07-26 15:00:48 -0400 |
commit | 8a53539bd603e5fe8fa52bdbb7277f6f49724522 (patch) | |
tree | 6142cb65a5945746a2b3b2dbee714e7985d63ede /src/backend/backup/basebackup_incremental.c | |
parent | 454aab4b738e53a5dbfca9251a7807a2ad21f87e (diff) | |
download | postgresql-8a53539bd603e5fe8fa52bdbb7277f6f49724522.tar.gz postgresql-8a53539bd603e5fe8fa52bdbb7277f6f49724522.zip |
Wait for WAL summarization to catch up before creating .partial file.

When a standby is promoted, CleanupAfterArchiveRecovery() may decide
to rename the final WAL file from the old timeline by adding ".partial"
to the name. If WAL summarization is enabled and this file is renamed
before its partial contents are summarized, WAL summarization breaks:
the summarizer gets stuck at that point in the WAL stream and just
errors out.

To fix that, first make the startup process wait for WAL summarization
to catch up before renaming the file. Generally, this should be quick,
and if it's not, the user can shut off summarize_wal and try again.

To make this fix work, also teach the WAL summarizer that after a
promotion has occurred, no more WAL can appear on the previous
timeline: previously, the WAL summarizer wouldn't switch to the new
timeline until we actually started writing WAL there, but that meant
that when the startup process was waiting for the WAL summarizer, it
was waiting for an action that the summarizer wasn't yet prepared to
take.

In the process of fixing these bugs, I realized that the logic to wait
for WAL summarization to catch up was spread out in a way that made
it difficult to reuse properly, so this commit refactors that logic to
make it easier to reuse.

Finally, add a test case that would have caught this bug and the
previously-fixed bug that WAL summarization sometimes needs to back up
when the timeline changes.

Discussion: https://postgr.es/m/CA+TgmoZGEsZodXC4f=XZNkAeyuDmWTSkpkjCEOcF19Am0mt_OA@mail.gmail.com
Diffstat (limited to 'src/backend/backup/basebackup_incremental.c')
-rw-r--r-- | src/backend/backup/basebackup_incremental.c | 90 |
1 files changed, 6 insertions, 84 deletions
diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c index a023e624403..86fa6821ad2 100644 --- a/src/backend/backup/basebackup_incremental.c +++ b/src/backend/backup/basebackup_incremental.c @@ -277,12 +277,6 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, TimeLineID earliest_wal_range_tli = 0; XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr; TimeLineID latest_wal_range_tli = 0; - XLogRecPtr summarized_lsn; - XLogRecPtr pending_lsn; - XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr; - int deadcycles = 0; - TimestampTz initial_time, - current_time; Assert(ib->buf.data == NULL); @@ -458,85 +452,13 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, } /* - * Wait for WAL summarization to catch up to the backup start LSN (but - * time out if it doesn't do so quickly enough). + * Wait for WAL summarization to catch up to the backup start LSN. This + * will throw an error if the WAL summarizer appears to be stuck. If WAL + * summarization gets disabled while we're waiting, this will return + * immediately, and we'll error out further down if the WAL summaries are + * incomplete. */ - initial_time = current_time = GetCurrentTimestamp(); - while (1) - { - long timeout_in_ms = 10000; - long elapsed_seconds; - - /* - * Align the wait time to prevent drift. This doesn't really matter, - * but we'd like the warnings about how long we've been waiting to say - * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever - * drifting to something that is not a multiple of ten. - */ - timeout_in_ms -= - TimestampDifferenceMilliseconds(initial_time, current_time) % - timeout_in_ms; - - /* Wait for up to 10 seconds. */ - summarized_lsn = WaitForWalSummarization(backup_state->startpoint, - timeout_in_ms, &pending_lsn); - - /* If WAL summarization has progressed sufficiently, stop waiting. 
*/ - if (summarized_lsn >= backup_state->startpoint) - break; - - /* - * Keep track of the number of cycles during which there has been no - * progression of pending_lsn. If pending_lsn is not advancing, that - * means that not only are no new files appearing on disk, but we're - * not even incorporating new records into the in-memory state. - */ - if (pending_lsn > prior_pending_lsn) - { - prior_pending_lsn = pending_lsn; - deadcycles = 0; - } - else - ++deadcycles; - - /* - * If we've managed to wait for an entire minute without the WAL - * summarizer absorbing a single WAL record, error out; probably - * something is wrong. - * - * We could consider also erroring out if the summarizer is taking too - * long to catch up, but it's not clear what rate of progress would be - * acceptable and what would be too slow. So instead, we just try to - * error out in the case where there's no progress at all. That seems - * likely to catch a reasonable number of the things that can go wrong - * in practice (e.g. the summarizer process is completely hung, say - * because somebody hooked up a debugger to it or something) without - * giving up too quickly when the system is just slow. - */ - if (deadcycles >= 6) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("WAL summarization is not progressing"), - errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.", - LSN_FORMAT_ARGS(backup_state->startpoint), - LSN_FORMAT_ARGS(summarized_lsn), - LSN_FORMAT_ARGS(pending_lsn)))); - - /* - * Otherwise, just let the user know what's happening. 
- */ - current_time = GetCurrentTimestamp(); - elapsed_seconds = - TimestampDifferenceMilliseconds(initial_time, current_time) / 1000; - ereport(WARNING, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("still waiting for WAL summarization through %X/%X after %ld seconds", - LSN_FORMAT_ARGS(backup_state->startpoint), - elapsed_seconds), - errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.", - LSN_FORMAT_ARGS(summarized_lsn), - LSN_FORMAT_ARGS(pending_lsn)))); - } + WaitForWalSummarization(backup_state->startpoint); /* * Retrieve a list of all WAL summaries on any timeline that overlap with |