diff options
author | Michael Paquier <michael@paquier.xyz> | 2023-02-06 08:28:42 +0900 |
---|---|---|
committer | Michael Paquier <michael@paquier.xyz> | 2023-02-06 08:28:42 +0900 |
commit | 2f6e15ac93c58c1140e4a4affe61e78f7346497a (patch) | |
tree | d2757eb6b456c8f3d882d53d711792d5ee2fdee4 /src/backend/access/transam/xlogarchive.c | |
parent | b2d0e13a0a4c31167d01e9871f907060c80b8fae (diff) | |
download | postgresql-2f6e15ac93c58c1140e4a4affe61e78f7346497a.tar.gz postgresql-2f6e15ac93c58c1140e4a4affe61e78f7346497a.zip |
Revert refactoring of restore command code to shell_restore.c
This reverts commits 24c35ec and 57169ad. PreRestoreCommand() and
PostRestoreCommand() need to be put closer to the system() call running
a restore_command, as they enable in_restore_command for the startup
process which would in turn trigger an immediate proc_exit() in the
SIGTERM handler. Perhaps we could get rid of this behavior entirely,
but 24c35ec has made the window where the flag is enabled much larger
than it was, and any Postgres-like actions (palloc, etc.) taken by code
paths while the flag is enabled could lead to more severe issues in the
shutdown processing.
Note that curculio has shown that there are many more problems in this
area, unrelated to this change, actually, hence the issues related to
that had better be addressed first. Keeping the code of HEAD in line
with the stable branches should make that a bit easier.
Per discussion with Andres Freund and Nathan Bossart.
Discussion: https://postgr.es/m/Y979NR3U5VnWrTwB@paquier.xyz
Diffstat (limited to 'src/backend/access/transam/xlogarchive.c')
-rw-r--r-- | src/backend/access/transam/xlogarchive.c | 121 |
1 files changed, 118 insertions, 3 deletions
diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c index 4b89addf976..fcc87ff44fd 100644 --- a/src/backend/access/transam/xlogarchive.c +++ b/src/backend/access/transam/xlogarchive.c @@ -22,6 +22,8 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogarchive.h" +#include "common/archive.h" +#include "common/percentrepl.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/startup.h" @@ -55,8 +57,9 @@ RestoreArchivedFile(char *path, const char *xlogfname, bool cleanupEnabled) { char xlogpath[MAXPGPATH]; + char *xlogRestoreCmd; char lastRestartPointFname[MAXPGPATH]; - bool ret; + int rc; struct stat stat_buf; XLogSegNo restartSegNo; XLogRecPtr restartRedoPtr; @@ -147,6 +150,15 @@ RestoreArchivedFile(char *path, const char *xlogfname, else XLogFileName(lastRestartPointFname, 0, 0L, wal_segment_size); + /* Build the restore command to execute */ + xlogRestoreCmd = BuildRestoreCommand(recoveryRestoreCommand, + xlogpath, xlogfname, + lastRestartPointFname); + + ereport(DEBUG3, + (errmsg_internal("executing restore command \"%s\"", + xlogRestoreCmd))); + /* * Check signals before restore command and reset afterwards. */ @@ -155,11 +167,15 @@ RestoreArchivedFile(char *path, const char *xlogfname, /* * Copy xlog from archival storage to XLOGDIR */ - ret = shell_restore(xlogfname, xlogpath, lastRestartPointFname); + fflush(NULL); + pgstat_report_wait_start(WAIT_EVENT_RESTORE_COMMAND); + rc = system(xlogRestoreCmd); + pgstat_report_wait_end(); PostRestoreCommand(); + pfree(xlogRestoreCmd); - if (ret) + if (rc == 0) { /* * command apparently succeeded, but let's make sure the file is @@ -215,6 +231,37 @@ RestoreArchivedFile(char *path, const char *xlogfname, } } + /* + * Remember, we rollforward UNTIL the restore fails so failure here is + * just part of the process... 
that makes it difficult to determine + * whether the restore failed because there isn't an archive to restore, + * or because the administrator has specified the restore program + * incorrectly. We have to assume the former. + * + * However, if the failure was due to any sort of signal, it's best to + * punt and abort recovery. (If we "return false" here, upper levels will + * assume that recovery is complete and start up the database!) It's + * essential to abort on child SIGINT and SIGQUIT, because per spec + * system() ignores SIGINT and SIGQUIT while waiting; if we see one of + * those it's a good bet we should have gotten it too. + * + * On SIGTERM, assume we have received a fast shutdown request, and exit + * cleanly. It's pure chance whether we receive the SIGTERM first, or the + * child process. If we receive it first, the signal handler will call + * proc_exit, otherwise we do it here. If we or the child process received + * SIGTERM for any other reason than a fast shutdown request, postmaster + * will perform an immediate shutdown when it sees us exiting + * unexpectedly. + * + * We treat hard shell errors such as "command not found" as fatal, too. + */ + if (wait_result_is_signal(rc, SIGTERM)) + proc_exit(1); + + ereport(wait_result_is_any_signal(rc, true) ? FATAL : DEBUG2, + (errmsg("could not restore file \"%s\" from archive: %s", + xlogfname, wait_result_to_str(rc)))); + not_available: /* @@ -229,6 +276,74 @@ not_available: } /* + * Attempt to execute an external shell command during recovery. + * + * 'command' is the shell command to be executed, 'commandName' is a + * human-readable name describing the command emitted in the logs. If + * 'failOnSignal' is true and the command is killed by a signal, a FATAL + * error is thrown. Otherwise a WARNING is emitted. + * + * This is currently used for recovery_end_command and archive_cleanup_command. 
+ */ +void +ExecuteRecoveryCommand(const char *command, const char *commandName, + bool failOnSignal, uint32 wait_event_info) +{ + char *xlogRecoveryCmd; + char lastRestartPointFname[MAXPGPATH]; + int rc; + XLogSegNo restartSegNo; + XLogRecPtr restartRedoPtr; + TimeLineID restartTli; + + Assert(command && commandName); + + /* + * Calculate the archive file cutoff point for use during log shipping + * replication. All files earlier than this point can be deleted from the + * archive, though there is no requirement to do so. + */ + GetOldestRestartPoint(&restartRedoPtr, &restartTli); + XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size); + XLogFileName(lastRestartPointFname, restartTli, restartSegNo, + wal_segment_size); + + /* + * construct the command to be executed + */ + xlogRecoveryCmd = replace_percent_placeholders(command, commandName, "r", lastRestartPointFname); + + ereport(DEBUG3, + (errmsg_internal("executing %s \"%s\"", commandName, command))); + + /* + * execute the constructed command + */ + fflush(NULL); + pgstat_report_wait_start(wait_event_info); + rc = system(xlogRecoveryCmd); + pgstat_report_wait_end(); + + pfree(xlogRecoveryCmd); + + if (rc != 0) + { + /* + * If the failure was due to any sort of signal, it's best to punt and + * abort recovery. See comments in RestoreArchivedFile(). + */ + ereport((failOnSignal && wait_result_is_any_signal(rc, true)) ? FATAL : WARNING, + /*------ + translator: First %s represents a postgresql.conf parameter name like + "recovery_end_command", the 2nd is the value of that parameter, the + third an already translated error message. */ + (errmsg("%s \"%s\": %s", commandName, + command, wait_result_to_str(rc)))); + } +} + + +/* * A file was restored from the archive under a temporary filename (path), * and now we want to keep it. Rename it under the permanent filename in * pg_wal (xlogfname), replacing any existing file with the same name. |