Diffstat (limited to 'src')
42 files changed, 5451 insertions, 41 deletions
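Editor's note: the on-disk layout of the "INCREMENTAL.*" files created by this commit is worth keeping in mind while reading the diff. The following is a minimal sketch inferred from sendFile() and GetIncrementalFileSize() in the hunks below; the magic value and block size used here are stand-ins (the real INCREMENTAL_MAGIC and BLCKSZ are defined elsewhere in the patch), so treat this as an illustration rather than the authoritative definition.

    #include <stdint.h>
    #include <stddef.h>

    /*
     * An incremental file, as written by sendFile() below, consists of:
     *
     *   uint32 magic;                      -- INCREMENTAL_MAGIC (stand-in here)
     *   uint32 num_blocks;                 -- number of blocks included
     *   uint32 truncation_block_length;    -- minimum reconstructed length, in blocks
     *   uint32 block_numbers[num_blocks];  -- sorted, segment-relative block numbers
     *   <num_blocks blocks of BLCKSZ bytes each>
     */
    #define SKETCH_BLCKSZ 8192      /* stand-in for BLCKSZ */

    /*
     * Mirrors GetIncrementalFileSize(): three 4-byte header fields, then one
     * block number and one block of data per included block.
     */
    static inline size_t
    incremental_file_size(unsigned num_blocks)
    {
        return 3 * sizeof(uint32_t) +
            (size_t) num_blocks * (sizeof(uint32_t) + SKETCH_BLCKSZ);
    }

Note that even a zero-block incremental file is 12 bytes, which is why GetFileBackupMethod() below prefers a full copy for zero-length relation files.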
diff --git a/src/backend/access/transam/xlogbackup.c b/src/backend/access/transam/xlogbackup.c index 21d68133ae1..f51d4282bb8 100644 --- a/src/backend/access/transam/xlogbackup.c +++ b/src/backend/access/transam/xlogbackup.c @@ -77,6 +77,16 @@ build_backup_content(BackupState *state, bool ishistoryfile) appendStringInfo(result, "STOP TIMELINE: %u\n", state->stoptli); } + /* either both istartpoint and istarttli should be set, or neither */ + Assert(XLogRecPtrIsInvalid(state->istartpoint) == (state->istarttli == 0)); + if (!XLogRecPtrIsInvalid(state->istartpoint)) + { + appendStringInfo(result, "INCREMENTAL FROM LSN: %X/%X\n", + LSN_FORMAT_ARGS(state->istartpoint)); + appendStringInfo(result, "INCREMENTAL FROM TLI: %u\n", + state->istarttli); + } + data = result->data; pfree(result); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index a2c8fa3981c..6f4f81f9927 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1295,6 +1295,12 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, tli_from_file, BACKUP_LABEL_FILE))); } + if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("this is an incremental backup, not a data directory"), + errhint("Use pg_combinebackup to reconstruct a valid data directory."))); + if (ferror(lfp) || FreeFile(lfp)) ereport(FATAL, (errcode_for_file_access(), diff --git a/src/backend/backup/Makefile b/src/backend/backup/Makefile index a67b3c58d47..751e6d3d5e2 100644 --- a/src/backend/backup/Makefile +++ b/src/backend/backup/Makefile @@ -19,6 +19,7 @@ OBJS = \ basebackup.o \ basebackup_copy.o \ basebackup_gzip.o \ + basebackup_incremental.o \ basebackup_lz4.o \ basebackup_zstd.o \ basebackup_progress.o \ diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index 35dd79babcb..5ee9628422e 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -20,8 +20,10 @@ #include "access/xlogbackup.h" #include "backup/backup_manifest.h" #include "backup/basebackup.h" +#include "backup/basebackup_incremental.h" #include "backup/basebackup_sink.h" #include "backup/basebackup_target.h" +#include "catalog/pg_tablespace_d.h" #include "commands/defrem.h" #include "common/compression.h" #include "common/file_perm.h" @@ -33,6 +35,7 @@ #include "pgtar.h" #include "port.h" #include "postmaster/syslogger.h" +#include "postmaster/walsummarizer.h" #include "replication/walsender.h" #include "replication/walsender_private.h" #include "storage/bufpage.h" @@ -64,6 +67,7 @@ typedef struct bool fastcheckpoint; bool nowait; bool includewal; + bool incremental; uint32 maxrate; bool sendtblspcmapfile; bool send_to_client; @@ -76,21 +80,28 @@ typedef struct } basebackup_options; static int64 sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, - struct backup_manifest_info *manifest); + struct backup_manifest_info *manifest, + IncrementalBackupInfo *ib); static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, List *tablespaces, bool sendtblspclinks, - backup_manifest_info *manifest, Oid spcoid); + backup_manifest_info *manifest, Oid spcoid, + IncrementalBackupInfo *ib); static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, struct stat *statbuf, bool missing_ok, Oid dboid, Oid spcoid, RelFileNumber relfilenumber, unsigned segno, - backup_manifest_info *manifest); 
+ backup_manifest_info *manifest, + unsigned num_incremental_blocks, + BlockNumber *incremental_blocks, + unsigned truncation_block_length); static off_t read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd, off_t offset, size_t length, BlockNumber blkno, bool verify_checksum, int *checksum_failures); +static void push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx, + size_t *bytes_done, void *data, size_t length); static bool verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno, uint16 *expected_checksum); @@ -102,7 +113,8 @@ static int64 _tarWriteHeader(bbsink *sink, const char *filename, bool sizeonly); static void _tarWritePadding(bbsink *sink, int len); static void convert_link_to_directory(const char *pathbuf, struct stat *statbuf); -static void perform_base_backup(basebackup_options *opt, bbsink *sink); +static void perform_base_backup(basebackup_options *opt, bbsink *sink, + IncrementalBackupInfo *ib); static void parse_basebackup_options(List *options, basebackup_options *opt); static int compareWalFileNames(const ListCell *a, const ListCell *b); static int basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, @@ -220,7 +232,8 @@ static const struct exclude_list_item excludeFiles[] = * clobbered by longjmp" from stupider versions of gcc. */ static void -perform_base_backup(basebackup_options *opt, bbsink *sink) +perform_base_backup(basebackup_options *opt, bbsink *sink, + IncrementalBackupInfo *ib) { bbsink_state state; XLogRecPtr endptr; @@ -270,6 +283,10 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) ListCell *lc; tablespaceinfo *newti; + /* If this is an incremental backup, execute preparatory steps. */ + if (ib != NULL) + PrepareForIncrementalBackup(ib, backup_state); + /* Add a node for the base directory at the end */ newti = palloc0(sizeof(tablespaceinfo)); newti->size = -1; @@ -289,10 +306,10 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) if (tmp->path == NULL) tmp->size = sendDir(sink, ".", 1, true, state.tablespaces, - true, NULL, InvalidOid); + true, NULL, InvalidOid, NULL); else tmp->size = sendTablespace(sink, tmp->path, tmp->oid, true, - NULL); + NULL, NULL); state.bytes_total += tmp->size; } state.bytes_total_is_valid = true; @@ -330,7 +347,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) /* Then the bulk of the files... */ sendDir(sink, ".", 1, false, state.tablespaces, - sendtblspclinks, &manifest, InvalidOid); + sendtblspclinks, &manifest, InvalidOid, ib); /* ... and pg_control after everything else. 
*/ if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0) @@ -340,7 +357,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) XLOG_CONTROL_FILE))); sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false, InvalidOid, InvalidOid, - InvalidRelFileNumber, 0, &manifest); + InvalidRelFileNumber, 0, &manifest, 0, NULL, 0); } else { @@ -348,7 +365,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) bbsink_begin_archive(sink, archive_name); - sendTablespace(sink, ti->path, ti->oid, false, &manifest); + sendTablespace(sink, ti->path, ti->oid, false, &manifest, ib); } /* @@ -610,7 +627,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) sendFile(sink, pathbuf, pathbuf, &statbuf, false, InvalidOid, InvalidOid, InvalidRelFileNumber, 0, - &manifest); + &manifest, 0, NULL, 0); /* unconditionally mark file as archived */ StatusFilePath(pathbuf, fname, ".done"); @@ -686,6 +703,7 @@ parse_basebackup_options(List *options, basebackup_options *opt) bool o_checkpoint = false; bool o_nowait = false; bool o_wal = false; + bool o_incremental = false; bool o_maxrate = false; bool o_tablespace_map = false; bool o_noverify_checksums = false; @@ -764,6 +782,19 @@ parse_basebackup_options(List *options, basebackup_options *opt) opt->includewal = defGetBoolean(defel); o_wal = true; } + else if (strcmp(defel->defname, "incremental") == 0) + { + if (o_incremental) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + opt->incremental = defGetBoolean(defel); + if (opt->incremental && !summarize_wal) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("incremental backups cannot be taken unless WAL summarization is enabled"))); + o_incremental = true; + } else if (strcmp(defel->defname, "max_rate") == 0) { int64 maxrate; @@ -956,7 +988,7 @@ parse_basebackup_options(List *options, basebackup_options *opt) * the filesystem, bypassing the buffer cache. */ void -SendBaseBackup(BaseBackupCmd *cmd) +SendBaseBackup(BaseBackupCmd *cmd, IncrementalBackupInfo *ib) { basebackup_options opt; bbsink *sink; @@ -981,6 +1013,20 @@ SendBaseBackup(BaseBackupCmd *cmd) } /* + * If we're asked to perform an incremental backup and the user has not + * supplied a manifest, that's an ERROR. + * + * If we're asked to perform a full backup and the user did supply a + * manifest, just ignore it. + */ + if (!opt.incremental) + ib = NULL; + else if (ib == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("must UPLOAD_MANIFEST before performing an incremental BASE_BACKUP"))); + + /* + * If the target is specifically 'client' then set up to stream the backup + * to the client; otherwise, it's being sent someplace else and should not + * be sent to the client.
BaseBackupGetSink has the job of setting up a @@ -1011,7 +1057,7 @@ SendBaseBackup(BaseBackupCmd *cmd) */ PG_TRY(); { - perform_base_backup(&opt, sink); + perform_base_backup(&opt, sink, ib); } PG_FINALLY(); { @@ -1089,7 +1135,7 @@ sendFileWithContent(bbsink *sink, const char *filename, const char *content, */ static int64 sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, - backup_manifest_info *manifest) + backup_manifest_info *manifest, IncrementalBackupInfo *ib) { int64 size; char pathbuf[MAXPGPATH]; @@ -1123,7 +1169,7 @@ sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, /* Send all the files in the tablespace version directory */ size += sendDir(sink, pathbuf, strlen(path), sizeonly, NIL, true, manifest, - spcoid); + spcoid, ib); return size; } @@ -1143,7 +1189,7 @@ sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, List *tablespaces, bool sendtblspclinks, backup_manifest_info *manifest, - Oid spcoid) + Oid spcoid, IncrementalBackupInfo *ib) { DIR *dir; struct dirent *de; @@ -1152,7 +1198,16 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, int64 size = 0; const char *lastDir; /* Split last dir from parent path. */ bool isRelationDir = false; /* Does directory contain relations? */ + bool isGlobalDir = false; Oid dboid = InvalidOid; + BlockNumber *relative_block_numbers = NULL; + + /* + * Since this array is relatively large, avoid putting it on the stack. + * But we don't need it at all if this is not an incremental backup. + */ + if (ib != NULL) + relative_block_numbers = palloc(sizeof(BlockNumber) * RELSEG_SIZE); /* * Determine if the current path is a database directory that can contain @@ -1185,7 +1240,10 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, } } else if (strcmp(path, "./global") == 0) + { isRelationDir = true; + isGlobalDir = true; + } dir = AllocateDir(path); while ((de = ReadDir(dir, path)) != NULL) @@ -1334,11 +1392,13 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, &statbuf, sizeonly); /* - * Also send archive_status directory (by hackishly reusing - * statbuf from above ...). + * Also send archive_status and summaries directories (by + * hackishly reusing statbuf from above ...). 
*/ size += _tarWriteHeader(sink, "./pg_wal/archive_status", NULL, &statbuf, sizeonly); + size += _tarWriteHeader(sink, "./pg_wal/summaries", NULL, + &statbuf, sizeonly); continue; /* don't recurse into pg_wal */ } @@ -1407,16 +1467,64 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, if (!skip_this_dir) size += sendDir(sink, pathbuf, basepathlen, sizeonly, tablespaces, - sendtblspclinks, manifest, spcoid); + sendtblspclinks, manifest, spcoid, ib); } else if (S_ISREG(statbuf.st_mode)) { bool sent = false; + unsigned num_blocks_required = 0; + unsigned truncation_block_length = 0; + char tarfilenamebuf[MAXPGPATH * 2]; + char *tarfilename = pathbuf + basepathlen + 1; + FileBackupMethod method = BACK_UP_FILE_FULLY; + + if (ib != NULL && isRelationFile) + { + Oid relspcoid; + char *lookup_path; + + if (OidIsValid(spcoid)) + { + relspcoid = spcoid; + lookup_path = psprintf("pg_tblspc/%u/%s", spcoid, + tarfilename); + } + else + { + if (isGlobalDir) + relspcoid = GLOBALTABLESPACE_OID; + else + relspcoid = DEFAULTTABLESPACE_OID; + lookup_path = pstrdup(tarfilename); + } + + method = GetFileBackupMethod(ib, lookup_path, dboid, relspcoid, + relfilenumber, relForkNum, + segno, statbuf.st_size, + &num_blocks_required, + relative_block_numbers, + &truncation_block_length); + if (method == BACK_UP_FILE_INCREMENTALLY) + { + statbuf.st_size = + GetIncrementalFileSize(num_blocks_required); + snprintf(tarfilenamebuf, sizeof(tarfilenamebuf), + "%s/INCREMENTAL.%s", + path + basepathlen + 1, + de->d_name); + tarfilename = tarfilenamebuf; + } + + pfree(lookup_path); + } if (!sizeonly) - sent = sendFile(sink, pathbuf, pathbuf + basepathlen + 1, &statbuf, + sent = sendFile(sink, pathbuf, tarfilename, &statbuf, true, dboid, spcoid, - relfilenumber, segno, manifest); + relfilenumber, segno, manifest, + num_blocks_required, + method == BACK_UP_FILE_INCREMENTALLY ? relative_block_numbers : NULL, + truncation_block_length); if (sent || sizeonly) { @@ -1434,6 +1542,10 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, ereport(WARNING, (errmsg("skipping special file \"%s\"", pathbuf))); } + + if (relative_block_numbers != NULL) + pfree(relative_block_numbers); + FreeDir(dir); return size; } @@ -1446,6 +1558,12 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, * If dboid is anything other than InvalidOid then any checksum failures * detected will get reported to the cumulative stats system. * + * If the file is to be sent incrementally, then num_incremental_blocks + * should be the number of blocks to be sent, and incremental_blocks + * an array of block numbers relative to the start of the current segment. + * If the whole file is to be sent, then incremental_blocks should be NULL, + * and num_incremental_blocks can have any value, as it will be ignored. + * * Returns true if the file was successfully sent, false if 'missing_ok', * and the file did not exist. 
*/ @@ -1453,7 +1571,8 @@ static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, struct stat *statbuf, bool missing_ok, Oid dboid, Oid spcoid, RelFileNumber relfilenumber, unsigned segno, - backup_manifest_info *manifest) + backup_manifest_info *manifest, unsigned num_incremental_blocks, + BlockNumber *incremental_blocks, unsigned truncation_block_length) { int fd; BlockNumber blkno = 0; @@ -1462,6 +1581,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, pgoff_t bytes_done = 0; bool verify_checksum = false; pg_checksum_context checksum_ctx; + int ibindex = 0; if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0) elog(ERROR, "could not initialize checksum of file \"%s\"", @@ -1495,21 +1615,110 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, verify_checksum = true; /* + * If we're sending an incremental file, write the file header. + */ + if (incremental_blocks != NULL) + { + unsigned magic = INCREMENTAL_MAGIC; + size_t header_bytes_done = 0; + + /* Emit header data. */ + push_to_sink(sink, &checksum_ctx, &header_bytes_done, + &magic, sizeof(magic)); + push_to_sink(sink, &checksum_ctx, &header_bytes_done, + &num_incremental_blocks, sizeof(num_incremental_blocks)); + push_to_sink(sink, &checksum_ctx, &header_bytes_done, + &truncation_block_length, sizeof(truncation_block_length)); + push_to_sink(sink, &checksum_ctx, &header_bytes_done, + incremental_blocks, + sizeof(BlockNumber) * num_incremental_blocks); + + /* Flush out any data still in the buffer so it's again empty. */ + if (header_bytes_done > 0) + { + bbsink_archive_contents(sink, header_bytes_done); + if (pg_checksum_update(&checksum_ctx, + (uint8 *) sink->bbs_buffer, + header_bytes_done) < 0) + elog(ERROR, "could not update checksum of base backup"); + } + + /* Update our notion of file position. */ + bytes_done += sizeof(magic); + bytes_done += sizeof(num_incremental_blocks); + bytes_done += sizeof(truncation_block_length); + bytes_done += sizeof(BlockNumber) * num_incremental_blocks; + } + + /* * Loop until we read the amount of data the caller told us to expect. The * file could be longer, if it was extended while we were sending it, but * for a base backup we can ignore such extended data. It will be restored * from WAL. */ - while (bytes_done < statbuf->st_size) + while (1) { - size_t remaining = statbuf->st_size - bytes_done; + /* + * Determine whether we've read all the data that we need, and if not, + * read some more. + */ + if (incremental_blocks == NULL) + { + size_t remaining = statbuf->st_size - bytes_done; + + /* + * If we've read the required number of bytes, then it's time to + * stop. + */ + if (bytes_done >= statbuf->st_size) + break; + + /* + * Read as many bytes as will fit in the buffer, or however many + * are left to read, whichever is less. + */ + cnt = read_file_data_into_buffer(sink, readfilename, fd, + bytes_done, remaining, + blkno + segno * RELSEG_SIZE, + verify_checksum, + &checksum_failures); + } + else + { + BlockNumber relative_blkno; - /* Try to read some more data. */ - cnt = read_file_data_into_buffer(sink, readfilename, fd, bytes_done, - remaining, - blkno + segno * RELSEG_SIZE, - verify_checksum, - &checksum_failures); + /* + * If we've read all the blocks, then it's time to stop. + */ + if (ibindex >= num_incremental_blocks) + break; + + /* + * Read just one block, whichever one is the next that we're + * supposed to include. 
+ */ + relative_blkno = incremental_blocks[ibindex++]; + cnt = read_file_data_into_buffer(sink, readfilename, fd, + relative_blkno * BLCKSZ, + BLCKSZ, + relative_blkno + segno * RELSEG_SIZE, + verify_checksum, + &checksum_failures); + + /* + * If we get a partial read, that must mean that the relation is + * being truncated. Ultimately, it should be truncated to a + * multiple of BLCKSZ, since this path should only be reached for + * relation files, but we might transiently observe an + * intermediate value. + * + * It should be fine to treat this just as if the entire block had + * been truncated away - i.e. fill this and all later blocks with + * zeroes. WAL replay will fix things up. + */ + if (cnt < BLCKSZ) + break; + } /* * If the amount of data we were able to read was not a multiple of @@ -1693,6 +1902,56 @@ read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd, } /* + * Push data into a bbsink. + * + * It's better, when possible, to read data directly into the bbsink's buffer, + * rather than using this function to copy it into the buffer; this function is + * for cases where that approach is not practical. + * + * bytes_done should point to a count of the number of bytes that are + * currently used in the bbsink's buffer. Upon return, the bytes identified by + * data and length will have been copied into the bbsink's buffer, flushing + * as required, and *bytes_done will have been updated accordingly. If the + * buffer was flushed, the previous contents will also have been fed to + * checksum_ctx. + * + * Note that after one or more calls to this function it is the caller's + * responsibility to perform any required final flush. + */ +static void +push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx, + size_t *bytes_done, void *data, size_t length) +{ + while (length > 0) + { + size_t bytes_to_copy; + + /* + * We use < here rather than <= so that if the data exactly fills the + * remaining buffer space, we trigger a flush now. + */ + if (length < sink->bbs_buffer_length - *bytes_done) + { + /* Append remaining data to buffer. */ + memcpy(sink->bbs_buffer + *bytes_done, data, length); + *bytes_done += length; + return; + } + + /* Copy until buffer is full and flush it. */ + bytes_to_copy = sink->bbs_buffer_length - *bytes_done; + memcpy(sink->bbs_buffer + *bytes_done, data, bytes_to_copy); + data = ((char *) data) + bytes_to_copy; + length -= bytes_to_copy; + bbsink_archive_contents(sink, sink->bbs_buffer_length); + if (pg_checksum_update(checksum_ctx, (uint8 *) sink->bbs_buffer, + sink->bbs_buffer_length) < 0) + elog(ERROR, "could not update checksum"); + *bytes_done = 0; + } +} + +/* * Try to verify the checksum for the provided page, if it seems appropriate * to do so. * diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c new file mode 100644 index 00000000000..1e5a5ac33ad --- /dev/null +++ b/src/backend/backup/basebackup_incremental.c @@ -0,0 +1,1003 @@ +/*------------------------------------------------------------------------- + * + * basebackup_incremental.c + * code for incremental backup support + * + * This code isn't actually in charge of taking an incremental backup; + * the actual construction of the incremental backup happens in + * basebackup.c. Here, we're concerned with providing the necessary + * supports for that operation. In particular, we need to parse the + * backup manifest supplied by the user taking the incremental backup + * and extract the required information from it. 
+ * + * Portions Copyright (c) 2010-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/backup/basebackup_incremental.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/timeline.h" +#include "access/xlog.h" +#include "access/xlogrecovery.h" +#include "backup/basebackup_incremental.h" +#include "backup/walsummary.h" +#include "common/blkreftable.h" +#include "common/parse_manifest.h" +#include "common/hashfn.h" +#include "postmaster/walsummarizer.h" + +#define BLOCKS_PER_READ 512 + +/* + * Details extracted from the WAL ranges present in the supplied backup manifest. + */ +typedef struct +{ + TimeLineID tli; + XLogRecPtr start_lsn; + XLogRecPtr end_lsn; +} backup_wal_range; + +/* + * Details extracted from the file list present in the supplied backup manifest. + */ +typedef struct +{ + uint32 status; + const char *path; + size_t size; +} backup_file_entry; + +static uint32 hash_string_pointer(const char *s); +#define SH_PREFIX backup_file +#define SH_ELEMENT_TYPE backup_file_entry +#define SH_KEY_TYPE const char * +#define SH_KEY path +#define SH_HASH_KEY(tb, key) hash_string_pointer(key) +#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0) +#define SH_SCOPE static inline +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +struct IncrementalBackupInfo +{ + /* Memory context for this object and its subsidiary objects. */ + MemoryContext mcxt; + + /* Temporary buffer for storing the manifest while parsing it. */ + StringInfoData buf; + + /* WAL ranges extracted from the backup manifest. */ + List *manifest_wal_ranges; + + /* + * Files extracted from the backup manifest. + * + * We don't really need this information, because we use WAL summaries to + * figure out what's changed. It would be unsafe to just rely on the list of + * files that existed before, because it's possible for a file to be + * removed and a new one created with the same name and different + * contents. In such cases, the whole file must still be sent. We can tell + * from the WAL summaries whether that happened, but not from the file + * list. + * + * Nonetheless, this data is useful for sanity checking. If a file that we + * think we shouldn't need to send is not present in the manifest for the + * prior backup, something has gone terribly wrong. We retain the file + * names and sizes, but not the checksums or last modified times, for + * which we have no use. + * + * One significant downside of storing this data is that it consumes + * memory. If that turns out to be a problem, we might have to decide not + * to retain this information, or to make it optional. + */ + backup_file_hash *manifest_files; + + /* + * Block-reference table for the incremental backup. + * + * It's possible that storing the entire block-reference table in memory + * will be a problem for some users. The in-memory format that we're using + * here is pretty efficient, converging to little more than 1 bit per + * block for relation forks with large numbers of modified blocks. It's + * possible, however, that if you try to perform an incremental backup of + * a database with a sufficiently large number of relations on a + * sufficiently small machine, you could run out of memory here. If that + * turns out to be a problem in practice, we'll need to be more clever.
+ */ + BlockRefTable *brtab; +}; + +static void manifest_process_file(JsonManifestParseContext *context, + char *pathname, + size_t size, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload); +static void manifest_process_wal_range(JsonManifestParseContext *context, + TimeLineID tli, + XLogRecPtr start_lsn, + XLogRecPtr end_lsn); +static void manifest_report_error(JsonManifestParseContext *ib, + const char *fmt,...) + pg_attribute_printf(2, 3) pg_attribute_noreturn(); +static int compare_block_numbers(const void *a, const void *b); + +/* + * Create a new object for storing information extracted from the manifest + * supplied when creating an incremental backup. + */ +IncrementalBackupInfo * +CreateIncrementalBackupInfo(MemoryContext mcxt) +{ + IncrementalBackupInfo *ib; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(mcxt); + + ib = palloc0(sizeof(IncrementalBackupInfo)); + ib->mcxt = mcxt; + initStringInfo(&ib->buf); + + /* + * It's hard to guess how many files a "typical" installation will have in + * the data directory, but a fresh initdb creates almost 1000 files as of + * this writing, so it seems to make sense for our estimate to be + * substantially higher. + */ + ib->manifest_files = backup_file_create(mcxt, 10000, NULL); + + MemoryContextSwitchTo(oldcontext); + + return ib; +} + +/* + * Before taking an incremental backup, the caller must supply the backup + * manifest from a prior backup. Each chunk of manifest data received + * from the client should be passed to this function. + */ +void +AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data, + int len) +{ + MemoryContext oldcontext; + + /* Switch to our memory context. */ + oldcontext = MemoryContextSwitchTo(ib->mcxt); + + /* + * XXX. Our json parser is at present incapable of parsing json blobs + * incrementally, so we have to accumulate the entire backup manifest + * before we can do anything with it. This should really be fixed, since + * some users might have very large numbers of files in the data + * directory. + */ + appendBinaryStringInfo(&ib->buf, data, len); + + /* Switch back to previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Finalize an IncrementalBackupInfo object after all manifest data has + * been supplied via calls to AppendIncrementalManifestData. + */ +void +FinalizeIncrementalManifest(IncrementalBackupInfo *ib) +{ + JsonManifestParseContext context; + MemoryContext oldcontext; + + /* Switch to our memory context. */ + oldcontext = MemoryContextSwitchTo(ib->mcxt); + + /* Parse the manifest. */ + context.private_data = ib; + context.per_file_cb = manifest_process_file; + context.per_wal_range_cb = manifest_process_wal_range; + context.error_cb = manifest_report_error; + json_parse_manifest(&context, ib->buf.data, ib->buf.len); + + /* Done with the buffer, so release memory. */ + pfree(ib->buf.data); + ib->buf.data = NULL; + + /* Switch back to previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Prepare to take an incremental backup. + * + * Before this function is called, AppendIncrementalManifestData and + * FinalizeIncrementalManifest should have already been called to pass all + * the manifest data to this object. + * + * This function performs sanity checks on the data extracted from the + * manifest and figures out for which WAL ranges we need summaries, and + * whether those summaries are available. Then, it reads and combines the + * data from those summary files.
It also updates the backup_state with the + * reference TLI and LSN for the prior backup. + */ +void +PrepareForIncrementalBackup(IncrementalBackupInfo *ib, + BackupState *backup_state) +{ + MemoryContext oldcontext; + List *expectedTLEs; + List *all_wslist, + *required_wslist = NIL; + ListCell *lc; + TimeLineHistoryEntry **tlep; + int num_wal_ranges; + int i; + bool found_backup_start_tli = false; + TimeLineID earliest_wal_range_tli = 0; + XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr; + TimeLineID latest_wal_range_tli = 0; + XLogRecPtr summarized_lsn; + XLogRecPtr pending_lsn; + XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr; + int deadcycles = 0; + TimestampTz initial_time, + current_time; + + Assert(ib->buf.data == NULL); + + /* Switch to our memory context. */ + oldcontext = MemoryContextSwitchTo(ib->mcxt); + + /* + * A valid backup manifest must always contain at least one WAL range + * (usually exactly one, unless the backup spanned a timeline switch). + */ + num_wal_ranges = list_length(ib->manifest_wal_ranges); + if (num_wal_ranges == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest contains no required WAL ranges"))); + + /* + * Match up the TLIs that appear in the WAL ranges of the backup manifest + * with those that appear in this server's timeline history. We expect + * every backup_wal_range to match to a TimeLineHistoryEntry; if it does + * not, that's an error. + * + * This loop also decides which of the WAL ranges in the manifest is most + * ancient and which one is the newest, according to the timeline history + * of this server, and stores TLIs of those WAL ranges into + * earliest_wal_range_tli and latest_wal_range_tli. It also updates + * earliest_wal_range_start_lsn to the start LSN of the WAL range for + * earliest_wal_range_tli. + * + * Note that the return value of readTimeLineHistory puts the latest + * timeline at the beginning of the list, not the end. Hence, the earliest + * TLI is the one that occurs nearest the end of the list returned by + * readTimeLineHistory, and the latest TLI is the one that occurs closest + * to the beginning. + */ + expectedTLEs = readTimeLineHistory(backup_state->starttli); + tlep = palloc0(num_wal_ranges * sizeof(TimeLineHistoryEntry *)); + for (i = 0; i < num_wal_ranges; ++i) + { + backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i); + bool saw_earliest_wal_range_tli = false; + bool saw_latest_wal_range_tli = false; + + /* Search this server's history for this WAL range's TLI. */ + foreach(lc, expectedTLEs) + { + TimeLineHistoryEntry *tle = lfirst(lc); + + if (tle->tli == range->tli) + { + tlep[i] = tle; + break; + } + + if (tle->tli == earliest_wal_range_tli) + saw_earliest_wal_range_tli = true; + if (tle->tli == latest_wal_range_tli) + saw_latest_wal_range_tli = true; + } + + /* + * An incremental backup can only be taken relative to a backup that + * represents a previous state of this server. If the backup requires + * WAL from a timeline that's not in our history, that definitely + * isn't the case. + */ + if (tlep[i] == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("timeline %u found in manifest, but not in this server's history", + range->tli))); + + /* + * If we found this TLI in the server's history before encountering + * the latest TLI seen so far in the server's history, then this TLI + * is the latest one seen so far.
+ * + * If on the other hand we saw the earliest TLI seen so far before + * finding this TLI, this TLI is earlier than the earliest one seen so + * far. And if this is the first TLI for which we've searched, it's + * also the earliest one seen so far. + * + * On the first loop iteration, both things should necessarily be + * true. + */ + if (!saw_latest_wal_range_tli) + latest_wal_range_tli = range->tli; + if (earliest_wal_range_tli == 0 || saw_earliest_wal_range_tli) + { + earliest_wal_range_tli = range->tli; + earliest_wal_range_start_lsn = range->start_lsn; + } + } + + /* + * Propagate information about the prior backup into the backup_label that + * will be generated for this backup. + */ + backup_state->istartpoint = earliest_wal_range_start_lsn; + backup_state->istarttli = earliest_wal_range_tli; + + /* + * Sanity check start and end LSNs for the WAL ranges in the manifest. + * + * Commonly, there won't be any timeline switches during the prior backup + * at all, but if there are, they should happen at the same LSNs that this + * server switched timelines. + * + * Whether there are any timeline switches during the prior backup or not, + * the prior backup shouldn't require any WAL from a timeline prior to the + * start of that timeline. It also shouldn't require any WAL from later + * than the start of this backup. + * + * If any of these sanity checks fail, one possible explanation is that + * the user has generated WAL on the same timeline with the same LSNs more + * than once. For instance, if two standbys running on timeline 1 were + * both promoted and (due to a broken archiving setup) both selected new + * timeline ID 2, then it's possible that one of these checks might trip. + * + * Note that there are lots of ways for the user to do something very bad + * without tripping any of these checks, and they are not intended to be + * comprehensive. It's pretty hard to see how we could be certain of + * anything here. However, if there's a problem staring us right in the + * face, it's best to report it, so we do. 
+ */ + for (i = 0; i < num_wal_ranges; ++i) + { + backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i); + + if (range->tli == earliest_wal_range_tli) + { + if (range->start_lsn < tlep[i]->begin) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest requires WAL from initial timeline %u starting at %X/%X, but that timeline begins at %X/%X", + range->tli, + LSN_FORMAT_ARGS(range->start_lsn), + LSN_FORMAT_ARGS(tlep[i]->begin)))); + } + else + { + if (range->start_lsn != tlep[i]->begin) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest requires WAL from continuation timeline %u starting at %X/%X, but that timeline begins at %X/%X", + range->tli, + LSN_FORMAT_ARGS(range->start_lsn), + LSN_FORMAT_ARGS(tlep[i]->begin)))); + } + + if (range->tli == latest_wal_range_tli) + { + if (range->end_lsn > backup_state->startpoint) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest requires WAL from final timeline %u ending at %X/%X, but this backup starts at %X/%X", + range->tli, + LSN_FORMAT_ARGS(range->end_lsn), + LSN_FORMAT_ARGS(backup_state->startpoint)))); + } + else + { + if (range->end_lsn != tlep[i]->end) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest requires WAL from non-final timeline %u ending at %X/%X, but this server switched timelines at %X/%X", + range->tli, + LSN_FORMAT_ARGS(range->end_lsn), + LSN_FORMAT_ARGS(tlep[i]->end)))); + } + + } + + /* + * Wait for WAL summarization to catch up to the backup start LSN (but + * time out if it doesn't do so quickly enough). + */ + initial_time = current_time = GetCurrentTimestamp(); + while (1) + { + long timeout_in_ms = 10000; + unsigned elapsed_seconds; + + /* + * Align the wait time to prevent drift. This doesn't really matter, + * but we'd like the warnings about how long we've been waiting to say + * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever + * drifting to something that is not a multiple of ten. + */ + timeout_in_ms -= + TimestampDifferenceMilliseconds(initial_time, current_time) % + timeout_in_ms; + + /* Wait for up to 10 seconds. */ + summarized_lsn = WaitForWalSummarization(backup_state->startpoint, + timeout_in_ms, &pending_lsn); + + /* If WAL summarization has progressed sufficiently, stop waiting. */ + if (summarized_lsn >= backup_state->startpoint) + break; + + /* + * Keep track of the number of cycles during which there has been no + * progression of pending_lsn. If pending_lsn is not advancing, that + * means that not only are no new files appearing on disk, but we're + * not even incorporating new records into the in-memory state. + */ + if (pending_lsn > prior_pending_lsn) + { + prior_pending_lsn = pending_lsn; + deadcycles = 0; + } + else + ++deadcycles; + + /* + * If we've managed to wait for an entire minute without the WAL + * summarizer absorbing a single WAL record, error out; probably + * something is wrong. + * + * We could consider also erroring out if the summarizer is taking too + * long to catch up, but it's not clear what rate of progress would be + * acceptable and what would be too slow. So instead, we just try to + * error out in the case where there's no progress at all. That seems + * likely to catch a reasonable number of the things that can go wrong + * in practice (e.g.
the summarizer process is completely hung, say + * because somebody hooked up a debugger to it or something) without + * giving up too quickly when the system is just slow. + */ + if (deadcycles >= 6) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL summarization is not progressing"), + errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.", + LSN_FORMAT_ARGS(backup_state->startpoint), + LSN_FORMAT_ARGS(summarized_lsn), + LSN_FORMAT_ARGS(pending_lsn)))); + + /* + * Otherwise, just let the user know what's happening. + */ + current_time = GetCurrentTimestamp(); + elapsed_seconds = + TimestampDifferenceMilliseconds(initial_time, current_time) / 1000; + ereport(WARNING, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("still waiting for WAL summarization through %X/%X after %d seconds", + LSN_FORMAT_ARGS(backup_state->startpoint), + elapsed_seconds), + errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.", + LSN_FORMAT_ARGS(summarized_lsn), + LSN_FORMAT_ARGS(pending_lsn)))); + } + + /* + * Retrieve a list of all WAL summaries on any timeline that overlap with + * the LSN range of interest. We could instead call GetWalSummaries() once + * per timeline in the loop that follows, but that would involve reading + * the directory multiple times. It should be mildly faster - and perhaps + * a bit safer - to do it just once. + */ + all_wslist = GetWalSummaries(0, earliest_wal_range_start_lsn, + backup_state->startpoint); + + /* + * We need WAL summaries for everything that happened during the prior + * backup and everything that happened afterward up until the point where + * the current backup started. + */ + foreach(lc, expectedTLEs) + { + TimeLineHistoryEntry *tle = lfirst(lc); + XLogRecPtr tli_start_lsn = tle->begin; + XLogRecPtr tli_end_lsn = tle->end; + XLogRecPtr tli_missing_lsn = InvalidXLogRecPtr; + List *tli_wslist; + + /* + * Working through the history of this server from the current + * timeline backwards, we skip everything until we find the timeline + * where this backup started. Most of the time, this means we won't + * skip anything at all, as it's unlikely that the timeline has + * changed since the beginning of the backup moments ago. + */ + if (tle->tli == backup_state->starttli) + { + found_backup_start_tli = true; + tli_end_lsn = backup_state->startpoint; + } + else if (!found_backup_start_tli) + continue; + + /* + * Find the summaries that overlap the LSN range of interest for this + * timeline. If this is the earliest timeline involved, the range of + * interest begins with the start LSN of the prior backup; otherwise, + * it begins at the LSN at which this timeline came into existence. If + * this is the latest TLI involved, the range of interest ends at the + * start LSN of the current backup; otherwise, it ends at the point + * where we switched from this timeline to the next one. + */ + if (tle->tli == earliest_wal_range_tli) + tli_start_lsn = earliest_wal_range_start_lsn; + tli_wslist = FilterWalSummaries(all_wslist, tle->tli, + tli_start_lsn, tli_end_lsn); + + /* + * There is no guarantee that the WAL summaries we found cover the + * entire range of LSNs for which summaries are required, or indeed + * that we found any WAL summaries at all. Check whether we have a + * problem of that sort.
+ */ + if (!WalSummariesAreComplete(tli_wslist, tli_start_lsn, tli_end_lsn, + &tli_missing_lsn)) + { + if (XLogRecPtrIsInvalid(tli_missing_lsn)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but no summaries for that timeline and LSN range exist", + tle->tli, + LSN_FORMAT_ARGS(tli_start_lsn), + LSN_FORMAT_ARGS(tli_end_lsn)))); + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but the summaries for that timeline and LSN range are incomplete", + tle->tli, + LSN_FORMAT_ARGS(tli_start_lsn), + LSN_FORMAT_ARGS(tli_end_lsn)), + errdetail("The first unsummarized LSN in this range is %X/%X.", + LSN_FORMAT_ARGS(tli_missing_lsn)))); + } + + /* + * Remember that we need to read these summaries. + * + * Technically, it's possible that this could read more files than + * required, since tli_wslist in theory could contain redundant + * summaries. For instance, if we have a summary from 0/10000000 to + * 0/20000000 and also one from 0/00000000 to 0/30000000, then the + * latter subsumes the former and the former could be ignored. + * + * We ignore this possibility because the WAL summarizer only tries to + * generate summaries that do not overlap. If somehow they exist, + * we'll do a bit of extra work but the results should still be + * correct. + */ + required_wslist = list_concat(required_wslist, tli_wslist); + + /* + * Timelines earlier than the one in which the prior backup began are + * not relevant. + */ + if (tle->tli == earliest_wal_range_tli) + break; + } + + /* + * Read all of the required block reference table files and merge all of + * the data into a single in-memory block reference table. + * + * See the comments for struct IncrementalBackupInfo for some thoughts on + * memory usage. + */ + ib->brtab = CreateEmptyBlockRefTable(); + foreach(lc, required_wslist) + { + WalSummaryFile *ws = lfirst(lc); + WalSummaryIO wsio; + BlockRefTableReader *reader; + RelFileLocator rlocator; + ForkNumber forknum; + BlockNumber limit_block; + BlockNumber blocks[BLOCKS_PER_READ]; + + wsio.file = OpenWalSummaryFile(ws, false); + wsio.filepos = 0; + ereport(DEBUG1, + (errmsg_internal("reading WAL summary file \"%s\"", + FilePathName(wsio.file)))); + reader = CreateBlockRefTableReader(ReadWalSummary, &wsio, + FilePathName(wsio.file), + ReportWalSummaryError, NULL); + while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum, + &limit_block)) + { + BlockRefTableSetLimitBlock(ib->brtab, &rlocator, + forknum, limit_block); + + while (1) + { + unsigned nblocks; + unsigned i; + + nblocks = BlockRefTableReaderGetBlocks(reader, blocks, + BLOCKS_PER_READ); + if (nblocks == 0) + break; + + for (i = 0; i < nblocks; ++i) + BlockRefTableMarkBlockModified(ib->brtab, &rlocator, + forknum, blocks[i]); + } + } + DestroyBlockRefTableReader(reader); + FileClose(wsio.file); + } + + /* Switch back to previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Get the pathname that should be used when a file is sent incrementally. + * + * The result is a palloc'd string.
+ */ +char * +GetIncrementalFilePath(Oid dboid, Oid spcoid, RelFileNumber relfilenumber, + ForkNumber forknum, unsigned segno) +{ + char *path; + char *lastslash; + char *ipath; + + path = GetRelationPath(dboid, spcoid, relfilenumber, InvalidBackendId, + forknum); + + lastslash = strrchr(path, '/'); + Assert(lastslash != NULL); + *lastslash = '\0'; + + if (segno > 0) + ipath = psprintf("%s/INCREMENTAL.%s.%u", path, lastslash + 1, segno); + else + ipath = psprintf("%s/INCREMENTAL.%s", path, lastslash + 1); + + pfree(path); + + return ipath; +} + +/* + * How should we back up a particular file as part of an incremental backup? + * + * If the return value is BACK_UP_FILE_FULLY, caller should back up the whole + * file just as if this were not an incremental backup. + * + * If the return value is BACK_UP_FILE_INCREMENTALLY, caller should include + * an incremental file in the backup instead of the entire file. On return, + * *num_blocks_required will be set to the number of blocks that need to be + * sent, and the actual block numbers will have been stored in + * relative_block_numbers, which should be an array of at least RELSEG_SIZE + * entries. + * In addition, *truncation_block_length will be set to the value that should + * be included in the incremental file. + */ +FileBackupMethod +GetFileBackupMethod(IncrementalBackupInfo *ib, const char *path, + Oid dboid, Oid spcoid, + RelFileNumber relfilenumber, ForkNumber forknum, + unsigned segno, size_t size, + unsigned *num_blocks_required, + BlockNumber *relative_block_numbers, + unsigned *truncation_block_length) +{ + BlockNumber absolute_block_numbers[RELSEG_SIZE]; + BlockNumber limit_block; + BlockNumber start_blkno; + BlockNumber stop_blkno; + RelFileLocator rlocator; + BlockRefTableEntry *brtentry; + unsigned i; + unsigned nblocks; + + /* Should only be called after PrepareForIncrementalBackup. */ + Assert(ib->buf.data == NULL); + + /* + * dboid could be InvalidOid if shared rel, but spcoid and relfilenumber + * should have legal values. + */ + Assert(OidIsValid(spcoid)); + Assert(RelFileNumberIsValid(relfilenumber)); + + /* + * If the file size is too large or not a multiple of BLCKSZ, then + * something weird is happening, so give up and send the whole file. + */ + if ((size % BLCKSZ) != 0 || size / BLCKSZ > RELSEG_SIZE) + return BACK_UP_FILE_FULLY; + + /* + * The free-space map fork is not properly WAL-logged, so we need to + * back up the entire file every time. + */ + if (forknum == FSM_FORKNUM) + return BACK_UP_FILE_FULLY; + + /* + * If this file was not part of the prior backup, back it up fully. + * + * If this file was created after the prior backup and before the start of + * the current backup, then the WAL summary information will tell us to + * back up the whole file. However, if this file was created after the + * start of the current backup, then the WAL summary won't know anything + * about it. Without this logic, we would erroneously conclude that it was + * OK to send it incrementally. + * + * Note that the file could have existed at the time of the prior backup, + * gotten deleted, and then a new file with the same name could have been + * created. In that case, this logic won't prevent the file from being + * backed up incrementally. But, if the deletion happened before the start + * of the current backup, the limit block will be 0, inducing a full + * backup.
If the deletion happened after the start of the current backup, + * reconstruction will erroneously combine blocks from the current + * lifespan of the file with blocks from the previous lifespan -- but in + * this type of case, WAL replay to reach backup consistency should remove + * and recreate the file anyway, so the initial bogus contents should not + * matter. + */ + if (backup_file_lookup(ib->manifest_files, path) == NULL) + { + char *ipath; + + ipath = GetIncrementalFilePath(dboid, spcoid, relfilenumber, + forknum, segno); + if (backup_file_lookup(ib->manifest_files, ipath) == NULL) + return BACK_UP_FILE_FULLY; + } + + /* Look up the block reference table entry. */ + rlocator.spcOid = spcoid; + rlocator.dbOid = dboid; + rlocator.relNumber = relfilenumber; + brtentry = BlockRefTableGetEntry(ib->brtab, &rlocator, forknum, + &limit_block); + + /* + * If there is no entry, then there have been no WAL-logged changes to the + * relation since the predecessor backup was taken, so we can back it up + * incrementally and need not include any modified blocks. + * + * However, if the file is zero-length, we should do a full backup, + * because an incremental file is always more than zero length, and it's + * silly to take an incremental backup when a full backup would be + * smaller. + */ + if (brtentry == NULL) + { + if (size == 0) + return BACK_UP_FILE_FULLY; + *num_blocks_required = 0; + *truncation_block_length = size / BLCKSZ; + return BACK_UP_FILE_INCREMENTALLY; + } + + /* + * If the limit_block is less than or equal to the point where this + * segment starts, send the whole file. + */ + if (limit_block <= segno * RELSEG_SIZE) + return BACK_UP_FILE_FULLY; + + /* + * Get relevant entries from the block reference table entry. + * + * We shouldn't overflow computing the start or stop block numbers, but if + * it manages to happen somehow, detect it and throw an error. + */ + start_blkno = segno * RELSEG_SIZE; + stop_blkno = start_blkno + (size / BLCKSZ); + if (start_blkno / RELSEG_SIZE != segno || stop_blkno < start_blkno) + ereport(ERROR, + errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("overflow computing block number bounds for segment %u with size %zu", + segno, size)); + nblocks = BlockRefTableEntryGetBlocks(brtentry, start_blkno, stop_blkno, + absolute_block_numbers, RELSEG_SIZE); + Assert(nblocks <= RELSEG_SIZE); + + /* + * If we're going to have to send nearly all of the blocks, then just send + * the whole file, because that won't require much extra storage or + * transfer and will speed up and simplify backup restoration. It's not + * clear what threshold is most appropriate here and perhaps it ought to + * be configurable, but for now we're just going to say that if we'd need + * to send 90% of the blocks anyway, give up and send the whole file. + * + * NB: If you change the threshold here, at least make sure to back up the + * file fully when every single block must be sent, because there's + * nothing good about sending an incremental file in that case. + */ + if (nblocks * BLCKSZ > size * 0.9) + return BACK_UP_FILE_FULLY; + + /* + * Looks like we can send an incremental file, so sort the absolute + * block numbers and then transpose absolute block numbers to relative + * block numbers. + * + * NB: If the block reference table was using the bitmap representation + * for a given chunk, the block numbers in that chunk will already be + * sorted, but when the array-of-offsets representation is used, we can + * receive block numbers here out of order.
+ */ + qsort(absolute_block_numbers, nblocks, sizeof(BlockNumber), + compare_block_numbers); + for (i = 0; i < nblocks; ++i) + relative_block_numbers[i] = absolute_block_numbers[i] - start_blkno; + *num_blocks_required = nblocks; + + /* + * The truncation block length is the minimum length of the reconstructed + * file. Any block numbers below this threshold that are not present in + * the backup need to be fetched from the prior backup. At or above this + * threshold, blocks should only be included in the result if they are + * present in the backup. (This may require inserting zero blocks if the + * blocks included in the backup are non-consecutive.) + */ + *truncation_block_length = size / BLCKSZ; + if (BlockNumberIsValid(limit_block)) + { + unsigned relative_limit = limit_block - segno * RELSEG_SIZE; + + if (*truncation_block_length < relative_limit) + *truncation_block_length = relative_limit; + } + + /* Send it incrementally. */ + return BACK_UP_FILE_INCREMENTALLY; +} + +/* + * Compute the size for an incremental file containing a given number of blocks. + */ +extern size_t +GetIncrementalFileSize(unsigned num_blocks_required) +{ + size_t result; + + /* Make sure we're not going to overflow. */ + Assert(num_blocks_required <= RELSEG_SIZE); + + /* + * Three four byte quantities (magic number, truncation block length, + * block count) followed by block numbers followed by block contents. + */ + result = 3 * sizeof(uint32); + result += (BLCKSZ + sizeof(BlockNumber)) * num_blocks_required; + + return result; +} + +/* + * Helper function for filemap hash table. + */ +static uint32 +hash_string_pointer(const char *s) +{ + const unsigned char *ss = (const unsigned char *) s; + + return hash_bytes(ss, strlen(s)); +} + +/* + * This callback is invoked for each file mentioned in the backup manifest. + * + * We store the path to each file and the size of each file for sanity-checking + * purposes. For further details, see comments for IncrementalBackupInfo. + */ +static void +manifest_process_file(JsonManifestParseContext *context, + char *pathname, size_t size, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload) +{ + IncrementalBackupInfo *ib = context->private_data; + backup_file_entry *entry; + bool found; + + entry = backup_file_insert(ib->manifest_files, pathname, &found); + if (!found) + { + entry->path = MemoryContextStrdup(ib->manifest_files->ctx, + pathname); + entry->size = size; + } +} + +/* + * This callback is invoked for each WAL range mentioned in the backup + * manifest. + * + * We're just interested in learning the oldest LSN and the corresponding TLI + * that appear in any WAL range. + */ +static void +manifest_process_wal_range(JsonManifestParseContext *context, + TimeLineID tli, XLogRecPtr start_lsn, + XLogRecPtr end_lsn) +{ + IncrementalBackupInfo *ib = context->private_data; + backup_wal_range *range = palloc(sizeof(backup_wal_range)); + + range->tli = tli; + range->start_lsn = start_lsn; + range->end_lsn = end_lsn; + ib->manifest_wal_ranges = lappend(ib->manifest_wal_ranges, range); +} + +/* + * This callback is invoked if an error occurs while parsing the backup + * manifest. + */ +static void +manifest_report_error(JsonManifestParseContext *context, const char *fmt,...)
+{ + StringInfoData errbuf; + + initStringInfo(&errbuf); + + for (;;) + { + va_list ap; + int needed; + + va_start(ap, fmt); + needed = appendStringInfoVA(&errbuf, fmt, ap); + va_end(ap); + if (needed == 0) + break; + enlargeStringInfo(&errbuf, needed); + } + + ereport(ERROR, + errmsg_internal("%s", errbuf.data)); +} + +/* + * Quicksort comparator for block numbers. + */ +static int +compare_block_numbers(const void *a, const void *b) +{ + BlockNumber aa = *(BlockNumber *) a; + BlockNumber bb = *(BlockNumber *) b; + + if (aa > bb) + return 1; + else if (aa == bb) + return 0; + else + return -1; +} diff --git a/src/backend/backup/meson.build b/src/backend/backup/meson.build index 5d4ebe3ebed..2a6a2dc7c0e 100644 --- a/src/backend/backup/meson.build +++ b/src/backend/backup/meson.build @@ -5,6 +5,7 @@ backend_sources += files( 'basebackup.c', 'basebackup_copy.c', 'basebackup_gzip.c', + 'basebackup_incremental.c', 'basebackup_lz4.c', 'basebackup_progress.c', 'basebackup_server.c', diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y index 0c874e33cf6..a5d118ed683 100644 --- a/src/backend/replication/repl_gram.y +++ b/src/backend/replication/repl_gram.y @@ -76,11 +76,12 @@ Node *replication_parse_result; %token K_EXPORT_SNAPSHOT %token K_NOEXPORT_SNAPSHOT %token K_USE_SNAPSHOT +%token K_UPLOAD_MANIFEST %type <node> command %type <node> base_backup start_replication start_logical_replication create_replication_slot drop_replication_slot identify_system - read_replication_slot timeline_history show + read_replication_slot timeline_history show upload_manifest %type <list> generic_option_list %type <defelt> generic_option %type <uintval> opt_timeline @@ -114,6 +115,7 @@ command: | read_replication_slot | timeline_history | show + | upload_manifest ; /* @@ -307,6 +309,16 @@ timeline_history: } ; +/* UPLOAD_MANIFEST doesn't currently accept any arguments */ +upload_manifest: + K_UPLOAD_MANIFEST + { + UploadManifestCmd *cmd = makeNode(UploadManifestCmd); + + $$ = (Node *) cmd; + } + ; + opt_physical: K_PHYSICAL | /* EMPTY */ @@ -411,6 +422,7 @@ ident_or_keyword: | K_EXPORT_SNAPSHOT { $$ = "export_snapshot"; } | K_NOEXPORT_SNAPSHOT { $$ = "noexport_snapshot"; } | K_USE_SNAPSHOT { $$ = "use_snapshot"; } + | K_UPLOAD_MANIFEST { $$ = "upload_manifest"; } ; %% diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l index 1cc7fb858cd..4805da08ee3 100644 --- a/src/backend/replication/repl_scanner.l +++ b/src/backend/replication/repl_scanner.l @@ -136,6 +136,7 @@ EXPORT_SNAPSHOT { return K_EXPORT_SNAPSHOT; } NOEXPORT_SNAPSHOT { return K_NOEXPORT_SNAPSHOT; } USE_SNAPSHOT { return K_USE_SNAPSHOT; } WAIT { return K_WAIT; } +UPLOAD_MANIFEST { return K_UPLOAD_MANIFEST; } {space}+ { /* do nothing */ } @@ -303,6 +304,7 @@ replication_scanner_is_replication_command(void) case K_DROP_REPLICATION_SLOT: case K_READ_REPLICATION_SLOT: case K_TIMELINE_HISTORY: + case K_UPLOAD_MANIFEST: case K_SHOW: /* Yes; push back the first token so we can parse later.
*/ repl_pushed_back_token = first_token; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 3bc9c823895..dbcda325540 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -58,6 +58,7 @@ #include "access/xlogrecovery.h" #include "access/xlogutils.h" #include "backup/basebackup.h" +#include "backup/basebackup_incremental.h" #include "catalog/pg_authid.h" #include "catalog/pg_type.h" #include "commands/dbcommands.h" @@ -138,6 +139,17 @@ bool wake_wal_senders = false; static XLogReaderState *xlogreader = NULL; /* + * If the UPLOAD_MANIFEST command is used to provide a backup manifest in + * preparation for an incremental backup, uploaded_manifest will point + * to an object containing information about its contents, and + * uploaded_manifest_mcxt will point to the memory context that contains + * that object and all of its subordinate data. Otherwise, both values will + * be NULL. + */ +static IncrementalBackupInfo *uploaded_manifest = NULL; +static MemoryContext uploaded_manifest_mcxt = NULL; + +/* * These variables keep track of the state of the timeline we're currently * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric, * the timeline is not the latest timeline on this server, and the server's @@ -233,6 +245,9 @@ static void XLogSendLogical(void); static void WalSndDone(WalSndSendDataCallback send_data); static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli); static void IdentifySystem(void); +static void UploadManifest(void); +static bool HandleUploadManifestPacket(StringInfo buf, off_t *offset, + IncrementalBackupInfo *ib); static void ReadReplicationSlot(ReadReplicationSlotCmd *cmd); static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd); static void DropReplicationSlot(DropReplicationSlotCmd *cmd); @@ -661,6 +676,143 @@ SendTimeLineHistory(TimeLineHistoryCmd *cmd) } /* + * Handle UPLOAD_MANIFEST command. + */ +static void +UploadManifest(void) +{ + MemoryContext mcxt; + IncrementalBackupInfo *ib; + off_t offset = 0; + StringInfoData buf; + + /* + * parsing the manifest will use the cryptohash stuff, which requires a + * resource owner + */ + Assert(CurrentResourceOwner == NULL); + CurrentResourceOwner = ResourceOwnerCreate(NULL, "base backup"); + + /* Prepare to read manifest data into a temporary context. */ + mcxt = AllocSetContextCreate(CurrentMemoryContext, + "incremental backup information", + ALLOCSET_DEFAULT_SIZES); + ib = CreateIncrementalBackupInfo(mcxt); + + /* Send a CopyInResponse message */ + pq_beginmessage(&buf, 'G'); + pq_sendbyte(&buf, 0); + pq_sendint16(&buf, 0); + pq_endmessage_reuse(&buf); + pq_flush(); + + /* Receive packets from client until done. */ + while (HandleUploadManifestPacket(&buf, &offset, ib)) + ; + + /* Finish up manifest processing. */ + FinalizeIncrementalManifest(ib); + + /* + * Discard any old manifest information and arrange to preserve the new + * information we just got. + * + * We assume that MemoryContextDelete and MemoryContextSetParent won't + * fail, and thus we shouldn't end up bailing out of here in such a way as + * to leave dangling pointers. + */ + if (uploaded_manifest_mcxt != NULL) + MemoryContextDelete(uploaded_manifest_mcxt); + MemoryContextSetParent(mcxt, CacheMemoryContext); + uploaded_manifest = ib; + uploaded_manifest_mcxt = mcxt; + + /* clean up the resource owner we created */ + WalSndResourceCleanup(true); +} + +/* + * Process one packet received during the handling of an UPLOAD_MANIFEST + * operation.
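+ * + * The expected exchange is an ordinary COPY-in sequence: zero or more + * CopyData ('d') messages carrying manifest bytes, terminated by CopyDone + * ('c') or aborted by CopyFail ('f'). As a rough client-side sketch using + * libpq (illustrative only, not part of this patch): + * + * PQsendQuery(conn, "UPLOAD_MANIFEST"); + * while ((n = read(fd, buf, sizeof buf)) > 0) + * PQputCopyData(conn, buf, n); + * PQputCopyEnd(conn, NULL);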
+ * + * 'buf' is scratch space. This function expects it to be initialized, doesn't + * care what the current contents are, and may override them with completely + * new contents. + * + * The return value is true if the caller should continue processing + * additional packets and false if the UPLOAD_MANIFEST operation is complete. + */ +static bool +HandleUploadManifestPacket(StringInfo buf, off_t *offset, + IncrementalBackupInfo *ib) +{ + int mtype; + int maxmsglen; + + HOLD_CANCEL_INTERRUPTS(); + + pq_startmsgread(); + mtype = pq_getbyte(); + if (mtype == EOF) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("unexpected EOF on client connection with an open transaction"))); + + switch (mtype) + { + case 'd': /* CopyData */ + maxmsglen = PQ_LARGE_MESSAGE_LIMIT; + break; + case 'c': /* CopyDone */ + case 'f': /* CopyFail */ + case 'H': /* Flush */ + case 'S': /* Sync */ + maxmsglen = PQ_SMALL_MESSAGE_LIMIT; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected message type 0x%02X during COPY from stdin", + mtype))); + maxmsglen = 0; /* keep compiler quiet */ + break; + } + + /* Now collect the message body */ + if (pq_getmessage(buf, maxmsglen)) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("unexpected EOF on client connection with an open transaction"))); + RESUME_CANCEL_INTERRUPTS(); + + /* Process the message */ + switch (mtype) + { + case 'd': /* CopyData */ + AppendIncrementalManifestData(ib, buf->data, buf->len); + return true; + + case 'c': /* CopyDone */ + return false; + + case 'H': /* Sync */ + case 'S': /* Flush */ + /* Ignore these while in CopyOut mode as we do elsewhere. */ + return true; + + case 'f': + ereport(ERROR, + (errcode(ERRCODE_QUERY_CANCELED), + errmsg("COPY from stdin failed: %s", + pq_getmsgstring(buf)))); + } + + /* Not reached. */ + Assert(false); + return false; +} + +/* * Handle START_REPLICATION command. 
* * At the moment, this never returns, but an ereport(ERROR) will take us back @@ -1801,7 +1953,7 @@ exec_replication_command(const char *cmd_string) cmdtag = "BASE_BACKUP"; set_ps_display(cmdtag); PreventInTransactionBlock(true, cmdtag); - SendBaseBackup((BaseBackupCmd *) cmd_node); + SendBaseBackup((BaseBackupCmd *) cmd_node, uploaded_manifest); EndReplicationCommand(cmdtag); break; @@ -1863,6 +2015,14 @@ exec_replication_command(const char *cmd_string) } break; + case T_UploadManifestCmd: + cmdtag = "UPLOAD_MANIFEST"; + set_ps_display(cmdtag); + PreventInTransactionBlock(true, cmdtag); + UploadManifest(); + EndReplicationCommand(cmdtag); + break; + default: elog(ERROR, "unrecognized replication command node tag: %u", cmd_node->type); diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 0e0ac22bdd6..706140eb9f4 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -32,6 +32,7 @@ #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" #include "postmaster/postmaster.h" +#include "postmaster/walsummarizer.h" #include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/slot.h" @@ -140,6 +141,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, ReplicationOriginShmemSize()); size = add_size(size, WalSndShmemSize()); size = add_size(size, WalRcvShmemSize()); + size = add_size(size, WalSummarizerShmemSize()); size = add_size(size, PgArchShmemSize()); size = add_size(size, ApplyLauncherShmemSize()); size = add_size(size, BTreeShmemSize()); @@ -337,6 +339,7 @@ CreateOrAttachShmemStructs(void) ReplicationOriginShmemInit(); WalSndShmemInit(); WalRcvShmemInit(); + WalSummarizerShmemInit(); PgArchShmemInit(); ApplyLauncherShmemInit(); diff --git a/src/bin/Makefile b/src/bin/Makefile index 373077bf52b..aa2210925e2 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -19,6 +19,7 @@ SUBDIRS = \ pg_archivecleanup \ pg_basebackup \ pg_checksums \ + pg_combinebackup \ pg_config \ pg_controldata \ pg_ctl \ diff --git a/src/bin/meson.build b/src/bin/meson.build index 67cb50630c5..4cb6fd59bb8 100644 --- a/src/bin/meson.build +++ b/src/bin/meson.build @@ -5,6 +5,7 @@ subdir('pg_amcheck') subdir('pg_archivecleanup') subdir('pg_basebackup') subdir('pg_checksums') +subdir('pg_combinebackup') subdir('pg_config') subdir('pg_controldata') subdir('pg_ctl') diff --git a/src/bin/pg_basebackup/bbstreamer_file.c b/src/bin/pg_basebackup/bbstreamer_file.c index 45f32974ff6..6b78ee283d9 100644 --- a/src/bin/pg_basebackup/bbstreamer_file.c +++ b/src/bin/pg_basebackup/bbstreamer_file.c @@ -296,6 +296,7 @@ should_allow_existing_directory(const char *pathname) if (strcmp(filename, "pg_wal") == 0 || strcmp(filename, "pg_xlog") == 0 || strcmp(filename, "archive_status") == 0 || + strcmp(filename, "summaries") == 0 || strcmp(filename, "pg_tblspc") == 0) return true; diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index f32684a8f23..5795b91261f 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -102,6 +102,11 @@ typedef void (*WriteDataCallback) (size_t nbytes, char *buf, #define MINIMUM_VERSION_FOR_TERMINATED_TARFILE 150000 /* + * pg_wal/summaries exists beginning with version 17. 
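+ * + * This cutoff is used below both to decide whether to create pg_wal/summaries + * when streaming WAL and to reject -i/--incremental against servers that + * cannot support it, e.g.: + * + * if (serverVersion < MINIMUM_VERSION_FOR_WAL_SUMMARIES) + * pg_fatal("server does not support incremental backup");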
+ */ +#define MINIMUM_VERSION_FOR_WAL_SUMMARIES 170000 + +/* * Different ways to include WAL */ typedef enum @@ -217,7 +222,8 @@ static void ReceiveBackupManifestInMemoryChunk(size_t r, char *copybuf, void *callback_data); static void BaseBackup(char *compression_algorithm, char *compression_detail, CompressionLocation compressloc, - pg_compress_specification *client_compress); + pg_compress_specification *client_compress, + char *incremental_manifest); static bool reached_end_position(XLogRecPtr segendpos, uint32 timeline, bool segment_finished); @@ -390,6 +396,8 @@ usage(void) printf(_("\nOptions controlling the output:\n")); printf(_(" -D, --pgdata=DIRECTORY receive base backup into directory\n")); printf(_(" -F, --format=p|t output format (plain (default), tar)\n")); + printf(_(" -i, --incremental=OLDMANIFEST\n")); + printf(_(" take incremental backup\n")); printf(_(" -r, --max-rate=RATE maximum transfer rate to transfer data directory\n" " (in kB/s, or use suffix \"k\" or \"M\")\n")); printf(_(" -R, --write-recovery-conf\n" @@ -688,6 +696,23 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier, if (pg_mkdir_p(statusdir, pg_dir_create_mode) != 0 && errno != EEXIST) pg_fatal("could not create directory \"%s\": %m", statusdir); + + /* + * For newer server versions, likewise create pg_wal/summaries + */ + if (PQserverVersion(conn) >= MINIMUM_VERSION_FOR_WAL_SUMMARIES) + { + char summarydir[MAXPGPATH]; + + snprintf(summarydir, sizeof(summarydir), "%s/%s/summaries", + basedir, + PQserverVersion(conn) < MINIMUM_VERSION_FOR_PG_WAL ? + "pg_xlog" : "pg_wal"); + + if (pg_mkdir_p(summarydir, pg_dir_create_mode) != 0 && + errno != EEXIST) + pg_fatal("could not create directory \"%s\": %m", summarydir); + } } /* @@ -1728,7 +1753,9 @@ ReceiveBackupManifestInMemoryChunk(size_t r, char *copybuf, static void BaseBackup(char *compression_algorithm, char *compression_detail, - CompressionLocation compressloc, pg_compress_specification *client_compress) + CompressionLocation compressloc, + pg_compress_specification *client_compress, + char *incremental_manifest) { PGresult *res; char *sysidentifier; @@ -1794,7 +1821,76 @@ BaseBackup(char *compression_algorithm, char *compression_detail, exit(1); /* - * Start the actual backup + * If the user wants an incremental backup, we must upload the manifest + * for the previous backup upon which it is to be based. + */ + if (incremental_manifest != NULL) + { + int fd; + char mbuf[65536]; + int nbytes; + + /* Reject if server is too old. */ + if (serverVersion < MINIMUM_VERSION_FOR_WAL_SUMMARIES) + pg_fatal("server does not support incremental backup"); + + /* Open the file. */ + fd = open(incremental_manifest, O_RDONLY | PG_BINARY, 0); + if (fd < 0) + pg_fatal("could not open file \"%s\": %m", incremental_manifest); + + /* Tell the server what we want to do. */ + if (PQsendQuery(conn, "UPLOAD_MANIFEST") == 0) + pg_fatal("could not send replication command \"%s\": %s", + "UPLOAD_MANIFEST", PQerrorMessage(conn)); + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_COPY_IN) + { + if (PQresultStatus(res) == PGRES_FATAL_ERROR) + pg_fatal("could not upload manifest: %s", + PQerrorMessage(conn)); + else + pg_fatal("could not upload manifest: unexpected status %s", + PQresStatus(PQresultStatus(res))); + } + + /* Loop, reading from the file and sending the data to the server.
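+ * read() may return a short count, so keep calling it until it returns 0 + * (end of file) or a negative value (an error, which is checked after the + * loop).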
*/ + while ((nbytes = read(fd, mbuf, sizeof mbuf)) > 0) + { + if (PQputCopyData(conn, mbuf, nbytes) < 0) + pg_fatal("could not send COPY data: %s", + PQerrorMessage(conn)); + } + + /* Bail out if we exited the loop due to an error. */ + if (nbytes < 0) + pg_fatal("could not read file \"%s\": %m", incremental_manifest); + + /* End the COPY operation. */ + if (PQputCopyEnd(conn, NULL) < 0) + pg_fatal("could not send end-of-COPY: %s", + PQerrorMessage(conn)); + + /* See whether the server is happy with what we sent. */ + res = PQgetResult(conn); + if (PQresultStatus(res) == PGRES_FATAL_ERROR) + pg_fatal("could not upload manifest: %s", + PQerrorMessage(conn)); + else if (PQresultStatus(res) != PGRES_COMMAND_OK) + pg_fatal("could not upload manifest: unexpected status %s", + PQresStatus(PQresultStatus(res))); + + /* Consume ReadyForQuery message from server. */ + res = PQgetResult(conn); + if (res != NULL) + pg_fatal("unexpected extra result while sending manifest"); + + /* Add INCREMENTAL option to BASE_BACKUP command. */ + AppendPlainCommandOption(&buf, use_new_option_syntax, "INCREMENTAL"); + } + + /* + * Continue building up the options list for the BASE_BACKUP command. */ AppendStringCommandOption(&buf, use_new_option_syntax, "LABEL", label); if (estimatesize) @@ -1901,6 +1997,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail, else basebkp = psprintf("BASE_BACKUP %s", buf.data); + /* OK, try to start the backup. */ if (PQsendQuery(conn, basebkp) == 0) pg_fatal("could not send replication command \"%s\": %s", "BASE_BACKUP", PQerrorMessage(conn)); @@ -2256,6 +2353,7 @@ main(int argc, char **argv) {"version", no_argument, NULL, 'V'}, {"pgdata", required_argument, NULL, 'D'}, {"format", required_argument, NULL, 'F'}, + {"incremental", required_argument, NULL, 'i'}, {"checkpoint", required_argument, NULL, 'c'}, {"create-slot", no_argument, NULL, 'C'}, {"max-rate", required_argument, NULL, 'r'}, @@ -2293,6 +2391,7 @@ main(int argc, char **argv) int option_index; char *compression_algorithm = "none"; char *compression_detail = NULL; + char *incremental_manifest = NULL; CompressionLocation compressloc = COMPRESS_LOCATION_UNSPECIFIED; pg_compress_specification client_compress; @@ -2317,7 +2416,7 @@ main(int argc, char **argv) atexit(cleanup_directories_atexit); - while ((c = getopt_long(argc, argv, "c:Cd:D:F:h:l:nNp:Pr:Rs:S:t:T:U:vwWX:zZ:", + while ((c = getopt_long(argc, argv, "c:Cd:D:F:h:i:l:nNp:Pr:Rs:S:t:T:U:vwWX:zZ:", long_options, &option_index)) != -1) { switch (c) @@ -2352,6 +2451,9 @@ main(int argc, char **argv) case 'h': dbhost = pg_strdup(optarg); break; + case 'i': + incremental_manifest = pg_strdup(optarg); + break; case 'l': label = pg_strdup(optarg); break; @@ -2765,7 +2867,7 @@ main(int argc, char **argv) } BaseBackup(compression_algorithm, compression_detail, compressloc, - &client_compress); + &client_compress, incremental_manifest); success = true; return 0; diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl index b9f5e1266b4..bf765291e7d 100644 --- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl +++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl @@ -223,10 +223,10 @@ SKIP: "check backup dir permissions"); } -# Only archive_status directory should be copied in pg_wal/. +# Only archive_status and summaries directories should be copied in pg_wal/. is_deeply( [ sort(slurp_dir("$tempdir/backup/pg_wal/")) ], - [ sort qw(. .. archive_status) ], + [ sort qw(. .. 
archive_status summaries) ], 'no WAL files copied'); # Contents of these directories should not be copied. diff --git a/src/bin/pg_combinebackup/.gitignore b/src/bin/pg_combinebackup/.gitignore new file mode 100644 index 00000000000..d7e617438c4 --- /dev/null +++ b/src/bin/pg_combinebackup/.gitignore @@ -0,0 +1 @@ +pg_combinebackup diff --git a/src/bin/pg_combinebackup/Makefile b/src/bin/pg_combinebackup/Makefile new file mode 100644 index 00000000000..78ba05e624b --- /dev/null +++ b/src/bin/pg_combinebackup/Makefile @@ -0,0 +1,52 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/bin/pg_combinebackup +# +# Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/bin/pg_combinebackup/Makefile +# +#------------------------------------------------------------------------- + +PGFILEDESC = "pg_combinebackup - combine incremental backups" +PGAPPICON=win32 + +subdir = src/bin/pg_combinebackup +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) +LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils + +OBJS = \ + $(WIN32RES) \ + pg_combinebackup.o \ + backup_label.o \ + copy_file.o \ + load_manifest.o \ + reconstruct.o \ + write_manifest.o + +all: pg_combinebackup + +pg_combinebackup: $(OBJS) | submake-libpgport submake-libpgfeutils + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + +install: all installdirs + $(INSTALL_PROGRAM) pg_combinebackup$(X) '$(DESTDIR)$(bindir)/pg_combinebackup$(X)' + +installdirs: + $(MKDIR_P) '$(DESTDIR)$(bindir)' + +uninstall: + rm -f '$(DESTDIR)$(bindir)/pg_combinebackup$(X)' + +clean distclean maintainer-clean: + rm -f pg_combinebackup$(X) $(OBJS) + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) diff --git a/src/bin/pg_combinebackup/backup_label.c b/src/bin/pg_combinebackup/backup_label.c new file mode 100644 index 00000000000..922e00854d6 --- /dev/null +++ b/src/bin/pg_combinebackup/backup_label.c @@ -0,0 +1,283 @@ +/*------------------------------------------------------------------------- + * + * Read and manipulate backup label files + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/backup_label.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include <unistd.h> + +#include "access/xlogdefs.h" +#include "backup_label.h" +#include "common/logging.h" +#include "common/file_perm.h" +#include "write_manifest.h" + +static int get_eol_offset(StringInfo buf); +static bool line_starts_with(char *s, char *e, char *match, char **sout); +static bool parse_lsn(char *s, char *e, XLogRecPtr *lsn, char **c); +static bool parse_tli(char *s, char *e, TimeLineID *tli); + +/* + * Parse a backup label file, starting at buf->cursor. + * + * We expect to find a START WAL LOCATION line, followed by an LSN, followed + * by a space; the resulting LSN is stored into *start_lsn. + * + * We expect to find a START TIMELINE line, followed by a TLI, followed by + * a newline; the resulting TLI is stored into *start_tli. + * + * We expect to find either both INCREMENTAL FROM LSN and INCREMENTAL FROM TLI + * or neither.
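+ * + * (As an illustration, with entirely made-up values, the backup_label of an + * incremental backup might contain lines such as: + * + * START WAL LOCATION: 0/C000028 (file 00000001000000000000000C) + * START TIMELINE: 1 + * INCREMENTAL FROM LSN: 0/4000028 + * INCREMENTAL FROM TLI: 1 + * + * among the other lines written for every backup.)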
If these are found, they should be followed by an LSN or TLI + * respectively and then by a newline, and the values will be stored into + * *previous_lsn and *previous_tli, respectively. + * + * Other lines in the provided backup_label data are ignored. filename is used + * for error reporting; errors are fatal. + */ +void +parse_backup_label(char *filename, StringInfo buf, + TimeLineID *start_tli, XLogRecPtr *start_lsn, + TimeLineID *previous_tli, XLogRecPtr *previous_lsn) +{ + int found = 0; + + *start_tli = 0; + *start_lsn = InvalidXLogRecPtr; + *previous_tli = 0; + *previous_lsn = InvalidXLogRecPtr; + + while (buf->cursor < buf->len) + { + char *s = &buf->data[buf->cursor]; + int eo = get_eol_offset(buf); + char *e = &buf->data[eo]; + char *c; + + if (line_starts_with(s, e, "START WAL LOCATION: ", &s)) + { + if (!parse_lsn(s, e, start_lsn, &c)) + pg_fatal("%s: could not parse %s", + filename, "START WAL LOCATION"); + if (c >= e || *c != ' ') + pg_fatal("%s: improper terminator for %s", + filename, "START WAL LOCATION"); + found |= 1; + } + else if (line_starts_with(s, e, "START TIMELINE: ", &s)) + { + if (!parse_tli(s, e, start_tli)) + pg_fatal("%s: could not parse TLI for %s", + filename, "START TIMELINE"); + if (*start_tli == 0) + pg_fatal("%s: invalid TLI", filename); + found |= 2; + } + else if (line_starts_with(s, e, "INCREMENTAL FROM LSN: ", &s)) + { + if (!parse_lsn(s, e, previous_lsn, &c)) + pg_fatal("%s: could not parse %s", + filename, "INCREMENTAL FROM LSN"); + if (c >= e || *c != '\n') + pg_fatal("%s: improper terminator for %s", + filename, "INCREMENTAL FROM LSN"); + found |= 4; + } + else if (line_starts_with(s, e, "INCREMENTAL FROM TLI: ", &s)) + { + if (!parse_tli(s, e, previous_tli)) + pg_fatal("%s: could not parse %s", + filename, "INCREMENTAL FROM TLI"); + if (*previous_tli == 0) + pg_fatal("%s: invalid TLI", filename); + found |= 8; + } + + buf->cursor = eo; + } + + if ((found & 1) == 0) + pg_fatal("%s: could not find %s", filename, "START WAL LOCATION"); + if ((found & 2) == 0) + pg_fatal("%s: could not find %s", filename, "START TIMELINE"); + if ((found & 4) != 0 && (found & 8) == 0) + pg_fatal("%s: %s requires %s", filename, + "INCREMENTAL FROM LSN", "INCREMENTAL FROM TLI"); + if ((found & 8) != 0 && (found & 4) == 0) + pg_fatal("%s: %s requires %s", filename, + "INCREMENTAL FROM TLI", "INCREMENTAL FROM LSN"); +} + +/* + * Write a backup label file to the output directory. + * + * This will be identical to the provided backup_label file, except that the + * INCREMENTAL FROM LSN and INCREMENTAL FROM TLI lines will be omitted. + * + * The new file will be checksummed using the specified algorithm. If + * mwriter != NULL, it will be added to the manifest. 
+ */ +void +write_backup_label(char *output_directory, StringInfo buf, + pg_checksum_type checksum_type, manifest_writer *mwriter) +{ + char output_filename[MAXPGPATH]; + int output_fd; + pg_checksum_context checksum_ctx; + uint8 checksum_payload[PG_CHECKSUM_MAX_LENGTH]; + int checksum_length; + + pg_checksum_init(&checksum_ctx, checksum_type); + + snprintf(output_filename, MAXPGPATH, "%s/backup_label", output_directory); + + if ((output_fd = open(output_filename, + O_WRONLY | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("could not open file \"%s\": %m", output_filename); + + while (buf->cursor < buf->len) + { + char *s = &buf->data[buf->cursor]; + int eo = get_eol_offset(buf); + char *e = &buf->data[eo]; + + if (!line_starts_with(s, e, "INCREMENTAL FROM LSN: ", NULL) && + !line_starts_with(s, e, "INCREMENTAL FROM TLI: ", NULL)) + { + ssize_t wb; + + wb = write(output_fd, s, e - s); + if (wb != e - s) + { + if (wb < 0) + pg_fatal("could not write file \"%s\": %m", output_filename); + else + pg_fatal("could not write file \"%s\": wrote only %d of %d bytes", + output_filename, (int) wb, (int) (e - s)); + } + if (pg_checksum_update(&checksum_ctx, (uint8 *) s, e - s) < 0) + pg_fatal("could not update checksum of file \"%s\"", + output_filename); + } + + buf->cursor = eo; + } + + if (close(output_fd) != 0) + pg_fatal("could not close \"%s\": %m", output_filename); + + checksum_length = pg_checksum_final(&checksum_ctx, checksum_payload); + + if (mwriter != NULL) + { + struct stat sb; + + /* + * We could track the length ourselves, but must stat() to get the + * mtime. + */ + if (stat(output_filename, &sb) < 0) + pg_fatal("could not stat file \"%s\": %m", output_filename); + add_file_to_manifest(mwriter, "backup_label", sb.st_size, + sb.st_mtime, checksum_type, + checksum_length, checksum_payload); + } +} + +/* + * Return the offset at which the next line in the buffer starts, or if there + * is none, the offset at which the buffer ends. + * + * The search begins at buf->cursor. + */ +static int +get_eol_offset(StringInfo buf) +{ + int eo = buf->cursor; + + while (eo < buf->len) + { + if (buf->data[eo] == '\n') + return eo + 1; + ++eo; + } + + return eo; +} + +/* + * Test whether the line that runs from s to e (inclusive of *s, but not + * inclusive of *e) starts with the match string provided, and return true + * or false according to whether or not this is the case. + * + * If the function returns true and if sout != NULL, stores a pointer to the + * byte following the match into *sout. + */ +static bool +line_starts_with(char *s, char *e, char *match, char **sout) +{ + while (s < e && *match != '\0' && *s == *match) + ++s, ++match; + + if (*match == '\0' && sout != NULL) + *sout = s; + + return (*match == '\0'); +} + +/* + * Parse an LSN starting at s and stopping at or before e. The return value + * is true on success and otherwise false. On success, stores the result into + * *lsn and sets *c to the first character that is not part of the LSN. + */ +static bool +parse_lsn(char *s, char *e, XLogRecPtr *lsn, char **c) +{ + char save = *e; + int nchars; + bool success; + unsigned hi; + unsigned lo; + + *e = '\0'; + success = (sscanf(s, "%X/%X%n", &hi, &lo, &nchars) == 2); + *e = save; + + if (success) + { + *lsn = ((XLogRecPtr) hi) << 32 | (XLogRecPtr) lo; + *c = s + nchars; + } + + return success; +} + +/* + * Parse a TLI starting at s and stopping at or before e. The return value is + * true on success and otherwise false.
On success, stores the result into + * *tli. If the first character that is not part of the TLI is anything other + * than a newline, that is deemed a failure. + */ +static bool +parse_tli(char *s, char *e, TimeLineID *tli) +{ + char save = *e; + int nchars; + bool success; + + *e = '\0'; + success = (sscanf(s, "%u%n", tli, &nchars) == 1); + *e = save; + + if (success && s[nchars] != '\n') + success = false; + + return success; +} diff --git a/src/bin/pg_combinebackup/backup_label.h b/src/bin/pg_combinebackup/backup_label.h new file mode 100644 index 00000000000..3af7ea274cc --- /dev/null +++ b/src/bin/pg_combinebackup/backup_label.h @@ -0,0 +1,30 @@ +/*------------------------------------------------------------------------- + * + * Read and manipulate backup label files + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/backup_label.h + * + *------------------------------------------------------------------------- + */ +#ifndef BACKUP_LABEL_H +#define BACKUP_LABEL_H + +#include "access/xlogdefs.h" +#include "common/checksum_helper.h" +#include "lib/stringinfo.h" + +struct manifest_writer; + +extern void parse_backup_label(char *filename, StringInfo buf, + TimeLineID *start_tli, + XLogRecPtr *start_lsn, + TimeLineID *previous_tli, + XLogRecPtr *previous_lsn); +extern void write_backup_label(char *output_directory, StringInfo buf, + pg_checksum_type checksum_type, + struct manifest_writer *mwriter); + +#endif /* BACKUP_LABEL_H */ diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c new file mode 100644 index 00000000000..40a55e30878 --- /dev/null +++ b/src/bin/pg_combinebackup/copy_file.c @@ -0,0 +1,169 @@ +/* + * Copy entire files. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/copy_file.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#ifdef HAVE_COPYFILE_H +#include <copyfile.h> +#endif +#include <fcntl.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "common/file_perm.h" +#include "common/logging.h" +#include "copy_file.h" + +static void copy_file_blocks(const char *src, const char *dst, + pg_checksum_context *checksum_ctx); + +#ifdef WIN32 +static void copy_file_copyfile(const char *src, const char *dst); +#endif + +/* + * Copy a regular file, optionally computing a checksum, and emitting + * appropriate debug messages. But if we're in dry-run mode, then just emit + * the messages and don't copy anything. + */ +void +copy_file(const char *src, const char *dst, + pg_checksum_context *checksum_ctx, bool dry_run) +{ + /* + * In dry-run mode, we don't actually copy anything, nor do we read any + * data from the source file, but we do verify that we can open it. + */ + if (dry_run) + { + int fd; + + if ((fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open \"%s\": %m", src); + if (close(fd) < 0) + pg_fatal("could not close \"%s\": %m", src); + } + + /* + * If we don't need to compute a checksum, then we can use any special + * operating system primitives that we know about to copy the file; this + * may be quicker than a naive block copy.
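+ * (At present only Windows wires up such a primitive, via CopyFile(); all + * other platforms fall through to the block-by-block copy below.)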
+ */ + if (checksum_ctx->type == CHECKSUM_TYPE_NONE) + { + char *strategy_name = NULL; + void (*strategy_implementation) (const char *, const char *) = NULL; + +#ifdef WIN32 + strategy_name = "CopyFile"; + strategy_implementation = copy_file_copyfile; +#endif + + if (strategy_name != NULL) + { + if (dry_run) + pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s", + src, dst, strategy_name); + else + { + pg_log_debug("copying \"%s\" to \"%s\" using strategy %s", + src, dst, strategy_name); + (*strategy_implementation) (src, dst); + } + return; + } + } + + /* + * Fall back to the simple approach of reading and writing all the blocks, + * feeding them into the checksum context as we go. + */ + if (dry_run) + { + if (checksum_ctx->type == CHECKSUM_TYPE_NONE) + pg_log_debug("would copy \"%s\" to \"%s\"", + src, dst); + else + pg_log_debug("would copy \"%s\" to \"%s\" and checksum with %s", + src, dst, pg_checksum_type_name(checksum_ctx->type)); + } + else + { + if (checksum_ctx->type == CHECKSUM_TYPE_NONE) + pg_log_debug("copying \"%s\" to \"%s\"", + src, dst); + else + pg_log_debug("copying \"%s\" to \"%s\" and checksumming with %s", + src, dst, pg_checksum_type_name(checksum_ctx->type)); + copy_file_blocks(src, dst, checksum_ctx); + } +} + +/* + * Copy a file block by block, and optionally compute a checksum as we go. + */ +static void +copy_file_blocks(const char *src, const char *dst, + pg_checksum_context *checksum_ctx) +{ + int src_fd; + int dest_fd; + uint8 *buffer; + const int buffer_size = 50 * BLCKSZ; + ssize_t rb; + unsigned offset = 0; + + if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open file \"%s\": %m", src); + + if ((dest_fd = open(dst, O_WRONLY | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("could not open file \"%s\": %m", dst); + + buffer = pg_malloc(buffer_size); + + while ((rb = read(src_fd, buffer, buffer_size)) > 0) + { + ssize_t wb; + + if ((wb = write(dest_fd, buffer, rb)) != rb) + { + if (wb < 0) + pg_fatal("could not write file \"%s\": %m", dst); + else + pg_fatal("could not write file \"%s\": wrote only %d of %d bytes at offset %u", + dst, (int) wb, (int) rb, offset); + } + + if (pg_checksum_update(checksum_ctx, buffer, rb) < 0) + pg_fatal("could not update checksum of file \"%s\"", dst); + + offset += rb; + } + + if (rb < 0) + pg_fatal("could not read file \"%s\": %m", src); + + pg_free(buffer); + close(src_fd); + close(dest_fd); +} + +#ifdef WIN32 +static void +copy_file_copyfile(const char *src, const char *dst) +{ + if (CopyFile(src, dst, true) == 0) + { + _dosmaperr(GetLastError()); + pg_fatal("could not copy \"%s\" to \"%s\": %m", src, dst); + } +} +#endif /* WIN32 */ diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h new file mode 100644 index 00000000000..031030bacbf --- /dev/null +++ b/src/bin/pg_combinebackup/copy_file.h @@ -0,0 +1,19 @@ +/* + * Copy entire files.
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/copy_file.h + * + *------------------------------------------------------------------------- + */ +#ifndef COPY_FILE_H +#define COPY_FILE_H + +#include "common/checksum_helper.h" + +extern void copy_file(const char *src, const char *dst, + pg_checksum_context *checksum_ctx, bool dry_run); + +#endif /* COPY_FILE_H */ diff --git a/src/bin/pg_combinebackup/load_manifest.c b/src/bin/pg_combinebackup/load_manifest.c new file mode 100644 index 00000000000..ad32323c9c2 --- /dev/null +++ b/src/bin/pg_combinebackup/load_manifest.c @@ -0,0 +1,245 @@ +/*------------------------------------------------------------------------- + * + * Load data from a backup manifest into memory. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/load_manifest.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#include <sys/stat.h> +#include <unistd.h> + +#include "common/hashfn.h" +#include "common/logging.h" +#include "common/parse_manifest.h" +#include "load_manifest.h" + +/* + * For efficiency, we'd like our hash table containing information about the + * manifest to start out with approximately the correct number of entries. + * There's no way to know the exact number of entries without reading the whole + * file, but we can get an estimate by dividing the file size by the estimated + * number of bytes per line. + * + * This could be off by about a factor of two in either direction, because the + * checksum algorithm has a big impact on the line lengths; e.g. a SHA512 + * checksum is 128 hex bytes, whereas a CRC-32C value is only 8, and there + * might be no checksum at all. + */ +#define ESTIMATED_BYTES_PER_MANIFEST_LINE 100 + +/* + * Define a hash table which we can use to store information about the files + * mentioned in the backup manifest. + */ +static uint32 hash_string_pointer(char *s); +#define SH_PREFIX manifest_files +#define SH_ELEMENT_TYPE manifest_file +#define SH_KEY_TYPE char * +#define SH_KEY pathname +#define SH_HASH_KEY(tb, key) hash_string_pointer(key) +#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0) +#define SH_SCOPE extern +#define SH_RAW_ALLOCATOR pg_malloc0 +#define SH_DEFINE +#include "lib/simplehash.h" + +static void combinebackup_per_file_cb(JsonManifestParseContext *context, + char *pathname, size_t size, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload); +static void combinebackup_per_wal_range_cb(JsonManifestParseContext *context, + TimeLineID tli, + XLogRecPtr start_lsn, + XLogRecPtr end_lsn); +static void report_manifest_error(JsonManifestParseContext *context, + const char *fmt,...) + pg_attribute_printf(2, 3) pg_attribute_noreturn(); + +/* + * Load backup_manifest files from an array of backups and produce an array + * of manifest_data objects. + * + * NB: Since load_backup_manifest() can return NULL, the resulting array could + * contain NULL entries.
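+ * Callers must therefore be prepared to see NULLs; for example, main() in + * pg_combinebackup.c checks the final backup's entry before attempting to + * write a combined manifest.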
+ */ +manifest_data ** +load_backup_manifests(int n_backups, char **backup_directories) +{ + manifest_data **result; + int i; + + result = pg_malloc(sizeof(manifest_data *) * n_backups); + for (i = 0; i < n_backups; ++i) + result[i] = load_backup_manifest(backup_directories[i]); + + return result; +} + +/* + * Parse the backup_manifest file in the named backup directory. Construct a + * hash table with information about all the files it mentions, and a linked + * list of all the WAL ranges it mentions. + * + * If the backup_manifest file simply doesn't exist, logs a warning and returns + * NULL. Any other error, or any error parsing the contents of the file, is + * fatal. + */ +manifest_data * +load_backup_manifest(char *backup_directory) +{ + char pathname[MAXPGPATH]; + int fd; + struct stat statbuf; + off_t estimate; + uint32 initial_size; + manifest_files_hash *ht; + char *buffer; + int rc; + JsonManifestParseContext context; + manifest_data *result; + + /* Open the manifest file. */ + snprintf(pathname, MAXPGPATH, "%s/backup_manifest", backup_directory); + if ((fd = open(pathname, O_RDONLY | PG_BINARY, 0)) < 0) + { + if (errno == ENOENT) + { + pg_log_warning("\"%s\" does not exist", pathname); + return NULL; + } + pg_fatal("could not open file \"%s\": %m", pathname); + } + + /* Figure out how big the manifest is. */ + if (fstat(fd, &statbuf) != 0) + pg_fatal("could not stat file \"%s\": %m", pathname); + + /* Guess how large to make the hash table based on the manifest size. */ + estimate = statbuf.st_size / ESTIMATED_BYTES_PER_MANIFEST_LINE; + initial_size = Min(PG_UINT32_MAX, Max(estimate, 256)); + + /* Create the hash table. */ + ht = manifest_files_create(initial_size, NULL); + + /* + * Slurp in the whole file. + * + * This is not ideal, but there's currently no way to get pg_parse_json() + * to perform incremental parsing. + */ + buffer = pg_malloc(statbuf.st_size); + rc = read(fd, buffer, statbuf.st_size); + if (rc != statbuf.st_size) + { + if (rc < 0) + pg_fatal("could not read file \"%s\": %m", pathname); + else + pg_fatal("could not read file \"%s\": read %d of %lld", + pathname, rc, (long long int) statbuf.st_size); + } + + /* Close the manifest file. */ + close(fd); + + /* Parse the manifest. */ + result = pg_malloc0(sizeof(manifest_data)); + result->files = ht; + context.private_data = result; + context.per_file_cb = combinebackup_per_file_cb; + context.per_wal_range_cb = combinebackup_per_wal_range_cb; + context.error_cb = report_manifest_error; + json_parse_manifest(&context, buffer, statbuf.st_size); + + /* All done. */ + pfree(buffer); + return result; +} + +/* + * Report an error while parsing the manifest. + * + * We consider all such errors to be fatal errors. The manifest parser + * expects this function not to return. + */ +static void +report_manifest_error(JsonManifestParseContext *context, const char *fmt,...) +{ + va_list ap; + + va_start(ap, fmt); + pg_log_generic_v(PG_LOG_ERROR, PG_LOG_PRIMARY, gettext(fmt), ap); + va_end(ap); + + exit(1); +} + +/* + * Record details extracted from the backup manifest for one file. + */ +static void +combinebackup_per_file_cb(JsonManifestParseContext *context, + char *pathname, size_t size, + pg_checksum_type checksum_type, + int checksum_length, uint8 *checksum_payload) +{ + manifest_data *manifest = context->private_data; + manifest_file *m; + bool found; + + /* Make a new entry in the hash table for this file. 
*/ + m = manifest_files_insert(manifest->files, pathname, &found); + if (found) + pg_fatal("duplicate path name in backup manifest: \"%s\"", pathname); + + /* Initialize the entry. */ + m->size = size; + m->checksum_type = checksum_type; + m->checksum_length = checksum_length; + m->checksum_payload = checksum_payload; +} + +/* + * Record details extracted from the backup manifest for one WAL range. + */ +static void +combinebackup_per_wal_range_cb(JsonManifestParseContext *context, + TimeLineID tli, + XLogRecPtr start_lsn, XLogRecPtr end_lsn) +{ + manifest_data *manifest = context->private_data; + manifest_wal_range *range; + + /* Allocate and initialize a struct describing this WAL range. */ + range = palloc(sizeof(manifest_wal_range)); + range->tli = tli; + range->start_lsn = start_lsn; + range->end_lsn = end_lsn; + range->prev = manifest->last_wal_range; + range->next = NULL; + + /* Add it to the end of the list. */ + if (manifest->first_wal_range == NULL) + manifest->first_wal_range = range; + else + manifest->last_wal_range->next = range; + manifest->last_wal_range = range; +} + +/* + * Helper function for manifest_files hash table. + */ +static uint32 +hash_string_pointer(char *s) +{ + unsigned char *ss = (unsigned char *) s; + + return hash_bytes(ss, strlen(s)); +} diff --git a/src/bin/pg_combinebackup/load_manifest.h b/src/bin/pg_combinebackup/load_manifest.h new file mode 100644 index 00000000000..2bfeeff1560 --- /dev/null +++ b/src/bin/pg_combinebackup/load_manifest.h @@ -0,0 +1,67 @@ +/*------------------------------------------------------------------------- + * + * Load data from a backup manifest into memory. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/load_manifest.h + * + *------------------------------------------------------------------------- + */ +#ifndef LOAD_MANIFEST_H +#define LOAD_MANIFEST_H + +#include "access/xlogdefs.h" +#include "common/checksum_helper.h" + +/* + * Each file described by the manifest file is parsed to produce an object + * like this. + */ +typedef struct manifest_file +{ + uint32 status; /* hash status */ + char *pathname; + size_t size; + pg_checksum_type checksum_type; + int checksum_length; + uint8 *checksum_payload; +} manifest_file; + +#define SH_PREFIX manifest_files +#define SH_ELEMENT_TYPE manifest_file +#define SH_KEY_TYPE char * +#define SH_SCOPE extern +#define SH_RAW_ALLOCATOR pg_malloc0 +#define SH_DECLARE +#include "lib/simplehash.h" + +/* + * Each WAL range described by the manifest file is parsed to produce an + * object like this. + */ +typedef struct manifest_wal_range +{ + TimeLineID tli; + XLogRecPtr start_lsn; + XLogRecPtr end_lsn; + struct manifest_wal_range *next; + struct manifest_wal_range *prev; +} manifest_wal_range; + +/* + * All the data parsed from a backup_manifest file. 
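+ * + * The files hash is populated by combinebackup_per_file_cb() and the WAL + * range list by combinebackup_per_wal_range_cb() in load_manifest.c.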
+ */ +typedef struct manifest_data +{ + manifest_files_hash *files; + manifest_wal_range *first_wal_range; + manifest_wal_range *last_wal_range; +} manifest_data; + +extern manifest_data *load_backup_manifest(char *backup_directory); +extern manifest_data **load_backup_manifests(int n_backups, + char **backup_directories); + +#endif /* LOAD_MANIFEST_H */ diff --git a/src/bin/pg_combinebackup/meson.build b/src/bin/pg_combinebackup/meson.build new file mode 100644 index 00000000000..e402d6f50e7 --- /dev/null +++ b/src/bin/pg_combinebackup/meson.build @@ -0,0 +1,38 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +pg_combinebackup_sources = files( + 'pg_combinebackup.c', + 'backup_label.c', + 'copy_file.c', + 'load_manifest.c', + 'reconstruct.c', + 'write_manifest.c', +) + +if host_system == 'windows' + pg_combinebackup_sources += rc_bin_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_combinebackup', + '--FILEDESC', 'pg_combinebackup - combine incremental backups',]) +endif + +pg_combinebackup = executable('pg_combinebackup', + pg_combinebackup_sources, + dependencies: [frontend_code], + kwargs: default_bin_args, +) +bin_targets += pg_combinebackup + +tests += { + 'name': 'pg_combinebackup', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'tests': [ + 't/001_basic.pl', + 't/002_compare_backups.pl', + 't/003_timeline.pl', + 't/004_manifest.pl', + 't/005_integrity.pl', + ], + } +} diff --git a/src/bin/pg_combinebackup/nls.mk b/src/bin/pg_combinebackup/nls.mk new file mode 100644 index 00000000000..c8e59d1d00d --- /dev/null +++ b/src/bin/pg_combinebackup/nls.mk @@ -0,0 +1,11 @@ +# src/bin/pg_combinebackup/nls.mk +CATALOG_NAME = pg_combinebackup +GETTEXT_FILES = $(FRONTEND_COMMON_GETTEXT_FILES) \ + backup_label.c \ + copy_file.c \ + load_manifest.c \ + pg_combinebackup.c \ + reconstruct.c \ + write_manifest.c +GETTEXT_TRIGGERS = $(FRONTEND_COMMON_GETTEXT_TRIGGERS) +GETTEXT_FLAGS = $(FRONTEND_COMMON_GETTEXT_FLAGS) diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c new file mode 100644 index 00000000000..85d3f4e5def --- /dev/null +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -0,0 +1,1284 @@ +/*------------------------------------------------------------------------- + * + * pg_combinebackup.c + * Combine incremental backups with prior backups. + * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/bin/pg_combinebackup/pg_combinebackup.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include <dirent.h> +#include <fcntl.h> +#include <limits.h> + +#include "backup_label.h" +#include "common/blkreftable.h" +#include "common/checksum_helper.h" +#include "common/controldata_utils.h" +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "common/logging.h" +#include "copy_file.h" +#include "fe_utils/option_utils.h" +#include "lib/stringinfo.h" +#include "load_manifest.h" +#include "getopt_long.h" +#include "reconstruct.h" +#include "write_manifest.h" + +/* Incremental file naming convention. */ +#define INCREMENTAL_PREFIX "INCREMENTAL." +#define INCREMENTAL_PREFIX_LENGTH (sizeof(INCREMENTAL_PREFIX) - 1) + +/* + * Tracking for directories that need to be removed, or have their contents + * removed, if the operation fails. 
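+ * + * Entries are added by remember_to_cleanup_directory() and processed by + * cleanup_directories_atexit(); reset_directory_cleanup_list() discards them + * once the operation has succeeded.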
+ */ +typedef struct cb_cleanup_dir +{ + char *target_path; + bool rmtopdir; + struct cb_cleanup_dir *next; +} cb_cleanup_dir; + +/* + * Stores a tablespace mapping provided using -T, --tablespace-mapping. + */ +typedef struct cb_tablespace_mapping +{ + char old_dir[MAXPGPATH]; + char new_dir[MAXPGPATH]; + struct cb_tablespace_mapping *next; +} cb_tablespace_mapping; + +/* + * Stores data parsed from all command-line options. + */ +typedef struct cb_options +{ + bool debug; + char *output; + bool dry_run; + bool no_sync; + cb_tablespace_mapping *tsmappings; + pg_checksum_type manifest_checksums; + bool no_manifest; + DataDirSyncMethod sync_method; +} cb_options; + +/* + * Data about a tablespace. + * + * Every normal tablespace needs a tablespace mapping, but in-place tablespaces + * don't, so the list of tablespaces can contain more entries than the list of + * tablespace mappings. + */ +typedef struct cb_tablespace +{ + Oid oid; + bool in_place; + char old_dir[MAXPGPATH]; + char new_dir[MAXPGPATH]; + struct cb_tablespace *next; +} cb_tablespace; + +/* Directories to be removed if we exit uncleanly. */ +cb_cleanup_dir *cleanup_dir_list = NULL; + +static void add_tablespace_mapping(cb_options *opt, char *arg); +static StringInfo check_backup_label_files(int n_backups, char **backup_dirs); +static void check_control_files(int n_backups, char **backup_dirs); +static void check_input_dir_permissions(char *dir); +static void cleanup_directories_atexit(void); +static void create_output_directory(char *dirname, cb_options *opt); +static void help(const char *progname); +static bool parse_oid(char *s, Oid *result); +static void process_directory_recursively(Oid tsoid, + char *input_directory, + char *output_directory, + char *relative_path, + int n_prior_backups, + char **prior_backup_dirs, + manifest_data **manifests, + manifest_writer *mwriter, + cb_options *opt); +static int read_pg_version_file(char *directory); +static void remember_to_cleanup_directory(char *target_path, bool rmtopdir); +static void reset_directory_cleanup_list(void); +static cb_tablespace *scan_for_existing_tablespaces(char *pathname, + cb_options *opt); +static void slurp_file(int fd, char *filename, StringInfo buf, int maxlen); + +/* + * Main program. 
+ */ +int +main(int argc, char *argv[]) +{ + static struct option long_options[] = { + {"debug", no_argument, NULL, 'd'}, + {"dry-run", no_argument, NULL, 'n'}, + {"no-sync", no_argument, NULL, 'N'}, + {"output", required_argument, NULL, 'o'}, + {"tablespace-mapping", required_argument, NULL, 'T'}, + {"manifest-checksums", required_argument, NULL, 1}, + {"no-manifest", no_argument, NULL, 2}, + {"sync-method", required_argument, NULL, 3}, + {NULL, 0, NULL, 0} + }; + + const char *progname; + char *last_input_dir; + int optindex; + int c; + int n_backups; + int n_prior_backups; + int version; + char **prior_backup_dirs; + cb_options opt; + cb_tablespace *tablespaces; + cb_tablespace *ts; + StringInfo last_backup_label; + manifest_data **manifests; + manifest_writer *mwriter; + + pg_logging_init(argv[0]); + progname = get_progname(argv[0]); + handle_help_version_opts(argc, argv, progname, help); + + memset(&opt, 0, sizeof(opt)); + opt.manifest_checksums = CHECKSUM_TYPE_CRC32C; + opt.sync_method = DATA_DIR_SYNC_METHOD_FSYNC; + + /* process command-line options */ + while ((c = getopt_long(argc, argv, "dnNPo:T:", + long_options, &optindex)) != -1) + { + switch (c) + { + case 'd': + opt.debug = true; + pg_logging_increase_verbosity(); + break; + case 'n': + opt.dry_run = true; + break; + case 'N': + opt.no_sync = true; + break; + case 'o': + opt.output = optarg; + break; + case 'T': + add_tablespace_mapping(&opt, optarg); + break; + case 1: + if (!pg_checksum_parse_type(optarg, + &opt.manifest_checksums)) + pg_fatal("unrecognized checksum algorithm: \"%s\"", + optarg); + break; + case 2: + opt.no_manifest = true; + break; + case 3: + if (!parse_sync_method(optarg, &opt.sync_method)) + exit(1); + break; + default: + /* getopt_long already emitted a complaint */ + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + } + + if (optind >= argc) + { + pg_log_error("%s: no input directories specified", progname); + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + + if (opt.output == NULL) + pg_fatal("no output directory specified"); + + /* If no manifest is needed, no checksums are needed, either. */ + if (opt.no_manifest) + opt.manifest_checksums = CHECKSUM_TYPE_NONE; + + /* Read the server version from the final backup. */ + version = read_pg_version_file(argv[argc - 1]); + + /* Sanity-check control files. */ + n_backups = argc - optind; + check_control_files(n_backups, argv + optind); + + /* Sanity-check backup_label files, and get the contents of the last one. */ + last_backup_label = check_backup_label_files(n_backups, argv + optind); + + /* + * We'll need the pathnames to the prior backups. By "prior" we mean all + * but the last one listed on the command line. + */ + n_prior_backups = argc - optind - 1; + prior_backup_dirs = argv + optind; + + /* Load backup manifests. */ + manifests = load_backup_manifests(n_backups, prior_backup_dirs); + + /* Figure out which tablespaces are going to be included in the output. */ + last_input_dir = argv[argc - 1]; + check_input_dir_permissions(last_input_dir); + tablespaces = scan_for_existing_tablespaces(last_input_dir, &opt); + + /* + * Create output directories. + * + * We create one output directory for the main data directory plus one for + * each non-in-place tablespace. create_output_directory() will arrange + * for those directories to be cleaned up on failure.
In-place tablespaces + * aren't handled at this stage because they're located beneath the main + * output directory, and thus the cleanup of that directory will get rid + * of them. Plus, the pg_tblspc directory that needs to contain them + * doesn't exist yet. + */ + atexit(cleanup_directories_atexit); + create_output_directory(opt.output, &opt); + for (ts = tablespaces; ts != NULL; ts = ts->next) + if (!ts->in_place) + create_output_directory(ts->new_dir, &opt); + + /* If we need to write a backup_manifest, prepare to do so. */ + if (!opt.dry_run && !opt.no_manifest) + { + mwriter = create_manifest_writer(opt.output); + + /* + * Verify that we have a backup manifest for the final backup; else we + * won't have the WAL ranges for the resulting manifest. + */ + if (manifests[n_prior_backups] == NULL) + pg_fatal("can't generate a manifest because no manifest is available for the final input backup"); + } + else + mwriter = NULL; + + /* Write backup label into output directory. */ + if (opt.dry_run) + pg_log_debug("would generate \"%s/backup_label\"", opt.output); + else + { + pg_log_debug("generating \"%s/backup_label\"", opt.output); + last_backup_label->cursor = 0; + write_backup_label(opt.output, last_backup_label, + opt.manifest_checksums, mwriter); + } + + /* Process everything that's not part of a user-defined tablespace. */ + pg_log_debug("processing backup directory \"%s\"", last_input_dir); + process_directory_recursively(InvalidOid, last_input_dir, opt.output, + NULL, n_prior_backups, prior_backup_dirs, + manifests, mwriter, &opt); + + /* Process user-defined tablespaces. */ + for (ts = tablespaces; ts != NULL; ts = ts->next) + { + pg_log_debug("processing tablespace directory \"%s\"", ts->old_dir); + + /* + * If it's a normal tablespace, we need to set up a symbolic link from + * pg_tblspc/${OID} to the target directory; if it's an in-place + * tablespace, we need to create a directory at pg_tblspc/${OID}. + */ + if (!ts->in_place) + { + char linkpath[MAXPGPATH]; + + snprintf(linkpath, MAXPGPATH, "%s/pg_tblspc/%u", opt.output, + ts->oid); + + if (opt.dry_run) + pg_log_debug("would create symbolic link from \"%s\" to \"%s\"", + linkpath, ts->new_dir); + else + { + pg_log_debug("creating symbolic link from \"%s\" to \"%s\"", + linkpath, ts->new_dir); + if (symlink(ts->new_dir, linkpath) != 0) + pg_fatal("could not create symbolic link from \"%s\" to \"%s\": %m", + linkpath, ts->new_dir); + } + } + else + { + if (opt.dry_run) + pg_log_debug("would create directory \"%s\"", ts->new_dir); + else + { + pg_log_debug("creating directory \"%s\"", ts->new_dir); + if (pg_mkdir_p(ts->new_dir, pg_dir_create_mode) == -1) + pg_fatal("could not create directory \"%s\": %m", + ts->new_dir); + } + } + + /* OK, now handle the directory contents. */ + process_directory_recursively(ts->oid, ts->old_dir, ts->new_dir, + NULL, n_prior_backups, prior_backup_dirs, + manifests, mwriter, &opt); + } + + /* Finalize the backup_manifest, if we're generating one. */ + if (mwriter != NULL) + finalize_manifest(mwriter, + manifests[n_prior_backups]->first_wal_range); + + /* fsync that output directory unless we've been told not to do so */ + if (!opt.no_sync) + { + if (opt.dry_run) + pg_log_debug("would recursively fsync \"%s\"", opt.output); + else + { + pg_log_debug("recursively fsyncing \"%s\"", opt.output); + sync_pgdata(opt.output, version * 10000, opt.sync_method); + } + } + + /* It's a success, so don't remove the output directories. 
*/ + reset_directory_cleanup_list(); + exit(0); +} + +/* + * Process the option argument for the -T, --tablespace-mapping switch. + */ +static void +add_tablespace_mapping(cb_options *opt, char *arg) +{ + cb_tablespace_mapping *tsmap = pg_malloc0(sizeof(cb_tablespace_mapping)); + char *dst; + char *dst_ptr; + char *arg_ptr; + + /* + * Basically, we just want to copy everything before the equals sign to + * tsmap->old_dir and everything afterwards to tsmap->new_dir, but if + * there's more or less than one equals sign, that's an error, and if + * there's an equals sign preceded by a backslash, don't treat it as a + * field separator but instead copy a literal equals sign. + */ + dst_ptr = dst = tsmap->old_dir; + for (arg_ptr = arg; *arg_ptr != '\0'; arg_ptr++) + { + if (dst_ptr - dst >= MAXPGPATH) + pg_fatal("directory name too long"); + + if (*arg_ptr == '\\' && *(arg_ptr + 1) == '=') + ; /* skip backslash escaping = */ + else if (*arg_ptr == '=' && (arg_ptr == arg || *(arg_ptr - 1) != '\\')) + { + if (tsmap->new_dir[0] != '\0') + pg_fatal("multiple \"=\" signs in tablespace mapping"); + else + dst = dst_ptr = tsmap->new_dir; + } + else + *dst_ptr++ = *arg_ptr; + } + if (!tsmap->old_dir[0] || !tsmap->new_dir[0]) + pg_fatal("invalid tablespace mapping format \"%s\", must be \"OLDDIR=NEWDIR\"", arg); + + /* + * All tablespaces are created with absolute directories, so specifying a + * non-absolute path here would never match, possibly confusing users. + * + * In contrast to pg_basebackup, both the old and new directories are on + * the local machine, so the local machine's definition of an absolute + * path is the only relevant one. + */ + if (!is_absolute_path(tsmap->old_dir)) + pg_fatal("old directory is not an absolute path in tablespace mapping: %s", + tsmap->old_dir); + + if (!is_absolute_path(tsmap->new_dir)) + pg_fatal("new directory is not an absolute path in tablespace mapping: %s", + tsmap->new_dir); + + /* Canonicalize paths to avoid spurious failures when comparing. */ + canonicalize_path(tsmap->old_dir); + canonicalize_path(tsmap->new_dir); + + /* Add it to the list. */ + tsmap->next = opt->tsmappings; + opt->tsmappings = tsmap; +} + +/* + * Check that the backup_label files form a coherent backup chain, and return + * the contents of the backup_label file from the latest backup. + */ +static StringInfo +check_backup_label_files(int n_backups, char **backup_dirs) +{ + StringInfo buf = makeStringInfo(); + StringInfo lastbuf = buf; + int i; + TimeLineID check_tli = 0; + XLogRecPtr check_lsn = InvalidXLogRecPtr; + + /* Try to read each backup_label file in turn, last to first. */ + for (i = n_backups - 1; i >= 0; --i) + { + char pathbuf[MAXPGPATH]; + int fd; + TimeLineID start_tli; + TimeLineID previous_tli; + XLogRecPtr start_lsn; + XLogRecPtr previous_lsn; + + /* Open the backup_label file. */ + snprintf(pathbuf, MAXPGPATH, "%s/backup_label", backup_dirs[i]); + pg_log_debug("reading \"%s\"", pathbuf); + if ((fd = open(pathbuf, O_RDONLY, 0)) < 0) + pg_fatal("could not open file \"%s\": %m", pathbuf); + + /* + * Slurp the whole file into memory. + * + * The exact size limit that we impose here doesn't really matter -- + * most of what's supposed to be in the file is fixed size and quite + * short. However, the length of the backup_label is limited (at least + * by some parts of the code) to MAXPGPATH, so include that value in + * the maximum length that we tolerate. + */ + slurp_file(fd, pathbuf, buf, 10000 + MAXPGPATH); + + /* Close the file.
*/
+		if (close(fd) != 0)
+			pg_fatal("could not close \"%s\": %m", pathbuf);
+
+		/* Parse the file contents. */
+		parse_backup_label(pathbuf, buf, &start_tli, &start_lsn,
+						   &previous_tli, &previous_lsn);
+
+		/*
+		 * Sanity checks.
+		 *
+		 * XXX. It's actually not required that start_lsn == check_lsn. It
+		 * would be OK if start_lsn > check_lsn provided that start_lsn is
+		 * less than or equal to the relevant switchpoint. But at the moment
+		 * we don't have that information.
+		 */
+		if (i > 0 && previous_tli == 0)
+			pg_fatal("backup at \"%s\" is a full backup, but only the first backup should be a full backup",
+					 backup_dirs[i]);
+		if (i == 0 && previous_tli != 0)
+			pg_fatal("backup at \"%s\" is an incremental backup, but the first backup should be a full backup",
+					 backup_dirs[i]);
+		if (i < n_backups - 1 && start_tli != check_tli)
+			pg_fatal("backup at \"%s\" starts on timeline %u, but expected %u",
+					 backup_dirs[i], start_tli, check_tli);
+		if (i < n_backups - 1 && start_lsn != check_lsn)
+			pg_fatal("backup at \"%s\" starts at LSN %X/%X, but expected %X/%X",
+					 backup_dirs[i],
+					 LSN_FORMAT_ARGS(start_lsn),
+					 LSN_FORMAT_ARGS(check_lsn));
+		check_tli = previous_tli;
+		check_lsn = previous_lsn;
+
+		/*
+		 * The last backup label in the chain needs to be saved for later use,
+		 * while the others are only needed within this loop.
+		 */
+		if (lastbuf == buf)
+			buf = makeStringInfo();
+		else
+			resetStringInfo(buf);
+	}
+
+	/* Free memory that we don't need any more. */
+	if (lastbuf != buf)
+	{
+		pfree(buf->data);
+		pfree(buf);
+	}
+
+	/*
+	 * Return the data from the first backup_label that we read (which is the
+	 * backup_label from the last directory specified on the command line).
+	 */
+	return lastbuf;
+}
+
+/*
+ * Sanity check control files.
+ */
+static void
+check_control_files(int n_backups, char **backup_dirs)
+{
+	int			i;
+	uint64		system_identifier = 0;	/* placate compiler */
+
+	/* Try to read each control file in turn, last to first. */
+	for (i = n_backups - 1; i >= 0; --i)
+	{
+		ControlFileData *control_file;
+		bool		crc_ok;
+		char	   *controlpath;
+
+		controlpath = psprintf("%s/%s", backup_dirs[i], "global/pg_control");
+		pg_log_debug("reading \"%s\"", controlpath);
+		control_file = get_controlfile(backup_dirs[i], &crc_ok);
+
+		/* Control file contents not meaningful if CRC is bad. */
+		if (!crc_ok)
+			pg_fatal("%s: CRC is incorrect", controlpath);
+
+		/* Can't interpret control file if not current version. */
+		if (control_file->pg_control_version != PG_CONTROL_VERSION)
+			pg_fatal("%s: unexpected control file version",
+					 controlpath);
+
+		/* System identifiers should all match. */
+		if (i == n_backups - 1)
+			system_identifier = control_file->system_identifier;
+		else if (system_identifier != control_file->system_identifier)
+			pg_fatal("%s: expected system identifier %llu, but found %llu",
+					 controlpath, (unsigned long long) system_identifier,
+					 (unsigned long long) control_file->system_identifier);
+
+		/* Release memory. */
+		pfree(control_file);
+		pfree(controlpath);
+	}
+
+	/*
+	 * If debug output is enabled, make a note of the system identifier that
+	 * we found in all of the relevant control files.
+	 */
+	pg_log_debug("system identifier is %llu",
+				 (unsigned long long) system_identifier);
+}
+
+/*
+ * Set default permissions for new files and directories based on the
+ * permissions of the given directory. The intent here is that the output
+ * directory should use the same permissions scheme as the final input
+ * directory.
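+ *
+ * As an illustrative note (the concrete modes come from the standard data
+ * directory permission machinery, not from code shown here): a final input
+ * directory with mode 0700 yields new files with mode 0600 and new
+ * directories with mode 0700, while one that allows group access (0750)
+ * yields 0640 and 0750 respectively.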
+ */
+static void
+check_input_dir_permissions(char *dir)
+{
+	struct stat st;
+
+	if (stat(dir, &st) != 0)
+		pg_fatal("could not stat \"%s\": %m", dir);
+
+	SetDataDirectoryCreatePerm(st.st_mode);
+}
+
+/*
+ * Clean up output directories before exiting.
+ */
+static void
+cleanup_directories_atexit(void)
+{
+	while (cleanup_dir_list != NULL)
+	{
+		cb_cleanup_dir *dir = cleanup_dir_list;
+
+		if (dir->rmtopdir)
+		{
+			pg_log_info("removing output directory \"%s\"", dir->target_path);
+			if (!rmtree(dir->target_path, dir->rmtopdir))
+				pg_log_error("failed to remove output directory");
+		}
+		else
+		{
+			pg_log_info("removing contents of output directory \"%s\"",
+						dir->target_path);
+			if (!rmtree(dir->target_path, dir->rmtopdir))
+				pg_log_error("failed to remove contents of output directory");
+		}
+
+		cleanup_dir_list = cleanup_dir_list->next;
+		pfree(dir);
+	}
+}
+
+/*
+ * Create the named output directory, unless it already exists or we're in
+ * dry-run mode. If it already exists but is not empty, that's a fatal error.
+ *
+ * Adds the created directory to the list of directories to be cleaned up
+ * at process exit.
+ */
+static void
+create_output_directory(char *dirname, cb_options *opt)
+{
+	switch (pg_check_dir(dirname))
+	{
+		case 0:
+			if (opt->dry_run)
+			{
+				pg_log_debug("would create directory \"%s\"", dirname);
+				return;
+			}
+			pg_log_debug("creating directory \"%s\"", dirname);
+			if (pg_mkdir_p(dirname, pg_dir_create_mode) == -1)
+				pg_fatal("could not create directory \"%s\": %m", dirname);
+			remember_to_cleanup_directory(dirname, true);
+			break;
+
+		case 1:
+			pg_log_debug("using existing directory \"%s\"", dirname);
+			remember_to_cleanup_directory(dirname, false);
+			break;
+
+		case 2:
+		case 3:
+		case 4:
+			pg_fatal("directory \"%s\" exists but is not empty", dirname);
+
+		case -1:
+			pg_fatal("could not access directory \"%s\": %m", dirname);
+	}
+}
+
+/*
+ * help
+ *
+ * Prints help page for the program
+ *
+ * progname: the name of the executed program, such as "pg_combinebackup"
+ */
+static void
+help(const char *progname)
+{
+	printf(_("%s reconstructs full backups from incrementals.\n\n"), progname);
+	printf(_("Usage:\n"));
+	printf(_("  %s [OPTION]... DIRECTORY...\n"), progname);
+	printf(_("\nOptions:\n"));
+	printf(_("  -d, --debug               generate lots of debugging output\n"));
+	printf(_("  -n, --dry-run             don't actually do anything\n"));
+	printf(_("  -N, --no-sync             do not wait for changes to be written safely to disk\n"));
+	printf(_("  -o, --output=DIRECTORY    output directory\n"));
+	printf(_("  -T, --tablespace-mapping=OLDDIR=NEWDIR\n"));
+	printf(_("                            relocate tablespace in OLDDIR to NEWDIR\n"));
+	printf(_("      --manifest-checksums=SHA{224,256,384,512}|CRC32C|NONE\n"
+			 "                            use algorithm for manifest checksums\n"));
+	printf(_("      --no-manifest         suppress generation of backup manifest\n"));
+	printf(_("      --sync-method=METHOD  set method for syncing files to disk\n"));
+	printf(_("  -?, --help                show this help, then exit\n"));
+
+	printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
+	printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
+}
+
+/*
+ * Try to parse a string as a non-zero OID without leading zeroes.
+ *
+ * If it works, return true and set *result to the answer, else return false.
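+ *
+ * For example, "16385" parses to OID 16385, while "0", "016385" (leading
+ * zero), "16385x" (trailing junk), and out-of-range values are all rejected.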
+ */
+static bool
+parse_oid(char *s, Oid *result)
+{
+	unsigned long val;
+	char	   *ep;
+
+	/* Reject a leading zero up front; valid OID strings never have one. */
+	if (s[0] == '0')
+		return false;
+
+	errno = 0;
+	val = strtoul(s, &ep, 10);
+	if (errno != 0 || *ep != '\0' || val < 1 || val > PG_UINT32_MAX)
+		return false;
+
+	*result = (Oid) val;
+	return true;
+}
+
+/*
+ * Copy files from the input directory to the output directory, reconstructing
+ * full files from incremental files as required.
+ *
+ * If we are processing a user-defined tablespace, tsoid should be the OID
+ * of that tablespace and input_directory and output_directory should be the
+ * toplevel input and output directories for that tablespace. Otherwise,
+ * tsoid should be InvalidOid and input_directory and output_directory should
+ * be the main input and output directories.
+ *
+ * relative_path is the path beneath the given input and output directories
+ * that we are currently processing. If NULL, it indicates that we're
+ * processing the input and output directories themselves.
+ *
+ * n_prior_backups is the number of prior backups that we have available.
+ * This doesn't count the very last backup, which is referenced by
+ * output_directory, just the older ones. prior_backup_dirs is an array of
+ * the locations of those previous backups.
+ */
+static void
+process_directory_recursively(Oid tsoid,
+							  char *input_directory,
+							  char *output_directory,
+							  char *relative_path,
+							  int n_prior_backups,
+							  char **prior_backup_dirs,
+							  manifest_data **manifests,
+							  manifest_writer *mwriter,
+							  cb_options *opt)
+{
+	char		ifulldir[MAXPGPATH];
+	char		ofulldir[MAXPGPATH];
+	char		manifest_prefix[MAXPGPATH];
+	DIR		   *dir;
+	struct dirent *de;
+	bool		is_pg_tblspc;
+	bool		is_pg_wal;
+	manifest_data *latest_manifest = manifests[n_prior_backups];
+	pg_checksum_type checksum_type;
+
+	/*
+	 * pg_tblspc and pg_wal are special cases, so detect those here.
+	 *
+	 * pg_tblspc is only special at the top level, but subdirectories of
+	 * pg_wal are just as special as the top level directory.
+	 *
+	 * Since incremental backup does not exist in pre-v10 versions, we don't
+	 * have to worry about the old pg_xlog naming.
+	 */
+	is_pg_tblspc = !OidIsValid(tsoid) && relative_path != NULL &&
+		strcmp(relative_path, "pg_tblspc") == 0;
+	is_pg_wal = !OidIsValid(tsoid) && relative_path != NULL &&
+		(strcmp(relative_path, "pg_wal") == 0 ||
+		 strncmp(relative_path, "pg_wal/", 7) == 0);
+
+	/*
+	 * If we're under pg_wal, then we don't need checksums, because these
+	 * files aren't included in the backup manifest. Otherwise use whatever
+	 * type of checksum is configured.
+	 */
+	if (!is_pg_wal)
+		checksum_type = opt->manifest_checksums;
+	else
+		checksum_type = CHECKSUM_TYPE_NONE;
+
+	/*
+	 * Append the relative path to the input and output directories, and
+	 * figure out the appropriate prefix to add to files in this directory
+	 * when looking them up in a backup manifest.
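+	 *
+	 * For instance (an illustrative example, not taken from a real cluster):
+	 * with tsoid 16385 and relative_path "subdir", files are read from
+	 * "<input>/subdir", written to "<output>/subdir", and looked up in the
+	 * manifest under the prefix "pg_tblspc/16385/subdir/".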
+	 */
+	if (relative_path == NULL)
+	{
+		strlcpy(ifulldir, input_directory, MAXPGPATH);
+		strlcpy(ofulldir, output_directory, MAXPGPATH);
+		if (OidIsValid(tsoid))
+			snprintf(manifest_prefix, MAXPGPATH, "pg_tblspc/%u/", tsoid);
+		else
+			manifest_prefix[0] = '\0';
+	}
+	else
+	{
+		snprintf(ifulldir, MAXPGPATH, "%s/%s", input_directory,
+				 relative_path);
+		snprintf(ofulldir, MAXPGPATH, "%s/%s", output_directory,
+				 relative_path);
+		if (OidIsValid(tsoid))
+			snprintf(manifest_prefix, MAXPGPATH, "pg_tblspc/%u/%s/",
+					 tsoid, relative_path);
+		else
+			snprintf(manifest_prefix, MAXPGPATH, "%s/", relative_path);
+	}
+
+	/*
+	 * Toplevel output directories have already been created by the time this
+	 * function is called, but any subdirectories are our responsibility.
+	 */
+	if (relative_path != NULL)
+	{
+		if (opt->dry_run)
+			pg_log_debug("would create directory \"%s\"", ofulldir);
+		else
+		{
+			pg_log_debug("creating directory \"%s\"", ofulldir);
+			if (mkdir(ofulldir, pg_dir_create_mode) == -1)
+				pg_fatal("could not create directory \"%s\": %m", ofulldir);
+		}
+	}
+
+	/* It's time to scan the directory. */
+	if ((dir = opendir(ifulldir)) == NULL)
+		pg_fatal("could not open directory \"%s\": %m", ifulldir);
+	while (errno = 0, (de = readdir(dir)) != NULL)
+	{
+		PGFileType	type;
+		char		ifullpath[MAXPGPATH];
+		char		ofullpath[MAXPGPATH];
+		char		manifest_path[MAXPGPATH];
+		Oid			oid = InvalidOid;
+		int			checksum_length = 0;
+		uint8	   *checksum_payload = NULL;
+		pg_checksum_context checksum_ctx;
+
+		/* Ignore "." and ".." entries. */
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		/* Construct input path. */
+		snprintf(ifullpath, MAXPGPATH, "%s/%s", ifulldir, de->d_name);
+
+		/* Figure out what kind of directory entry this is. */
+		type = get_dirent_type(ifullpath, de, false, PG_LOG_ERROR);
+		if (type == PGFILETYPE_ERROR)
+			exit(1);
+
+		/*
+		 * If we're processing pg_tblspc, then check whether the filename
+		 * looks like it could be a tablespace OID. If so, and if the
+		 * directory entry is a symbolic link or a directory, skip it.
+		 *
+		 * Our goal here is to ignore anything that would have been considered
+		 * by scan_for_existing_tablespaces to be a tablespace.
+		 */
+		if (is_pg_tblspc && parse_oid(de->d_name, &oid) &&
+			(type == PGFILETYPE_LNK || type == PGFILETYPE_DIR))
+			continue;
+
+		/* If it's a directory, recurse. */
+		if (type == PGFILETYPE_DIR)
+		{
+			char		new_relative_path[MAXPGPATH];
+
+			/* Append new pathname component to relative path. */
+			if (relative_path == NULL)
+				strlcpy(new_relative_path, de->d_name, MAXPGPATH);
+			else
+				snprintf(new_relative_path, MAXPGPATH, "%s/%s", relative_path,
+						 de->d_name);
+
+			/* And recurse. */
+			process_directory_recursively(tsoid,
+										  input_directory, output_directory,
+										  new_relative_path,
+										  n_prior_backups, prior_backup_dirs,
+										  manifests, mwriter, opt);
+			continue;
+		}
+
+		/* Skip anything that's not a regular file. */
+		if (type != PGFILETYPE_REG)
+		{
+			if (type == PGFILETYPE_LNK)
+				pg_log_warning("skipping symbolic link \"%s\"", ifullpath);
+			else
+				pg_log_warning("skipping special file \"%s\"", ifullpath);
+			continue;
+		}
+
+		/*
+		 * Skip the backup_label and backup_manifest files; they require
+		 * special handling and are handled elsewhere.
+		 */
+		if (relative_path == NULL &&
+			(strcmp(de->d_name, "backup_label") == 0 ||
+			 strcmp(de->d_name, "backup_manifest") == 0))
+			continue;
+
+		/*
+		 * If it's an incremental file, hand it off to the reconstruction
+		 * code, which will figure out what to do.
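+		 *
+		 * For example, a directory entry named "INCREMENTAL.16384" is
+		 * reconstructed into an output file named "16384", and its manifest
+		 * entry likewise uses the stripped name.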
+		 */
+		if (strncmp(de->d_name, INCREMENTAL_PREFIX,
+					INCREMENTAL_PREFIX_LENGTH) == 0)
+		{
+			/* Output path should not include "INCREMENTAL." prefix. */
+			snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir,
+					 de->d_name + INCREMENTAL_PREFIX_LENGTH);
+
+			/* Manifest path likewise omits incremental prefix. */
+			snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix,
+					 de->d_name + INCREMENTAL_PREFIX_LENGTH);
+
+			/* Reconstruction logic will do the rest. */
+			reconstruct_from_incremental_file(ifullpath, ofullpath,
+											  relative_path,
+											  de->d_name + INCREMENTAL_PREFIX_LENGTH,
+											  n_prior_backups,
+											  prior_backup_dirs,
+											  manifests,
+											  manifest_path,
+											  checksum_type,
+											  &checksum_length,
+											  &checksum_payload,
+											  opt->debug,
+											  opt->dry_run);
+		}
+		else
+		{
+			/* Construct the path that the backup_manifest will use. */
+			snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix,
+					 de->d_name);
+
+			/*
+			 * It's not an incremental file, so we need to copy the entire
+			 * file to the output directory.
+			 *
+			 * If a checksum of the required type already exists in the
+			 * backup_manifest for the final input directory, we can save some
+			 * work by reusing that checksum instead of computing a new one.
+			 */
+			if (checksum_type != CHECKSUM_TYPE_NONE &&
+				latest_manifest != NULL)
+			{
+				manifest_file *mfile;
+
+				mfile = manifest_files_lookup(latest_manifest->files,
+											  manifest_path);
+				if (mfile == NULL)
+				{
+					char	   *bmpath;
+
+					/*
+					 * The directory is out of sync with the backup_manifest,
+					 * so emit a warning.
+					 */
+					bmpath = psprintf("%s/%s", input_directory,
+									  "backup_manifest");
+					pg_log_warning("\"%s\" contains no entry for \"%s\"",
+								   bmpath, manifest_path);
+					pfree(bmpath);
+				}
+				else if (mfile->checksum_type == checksum_type)
+				{
+					/*
+					 * Copy the payload rather than pointing into the
+					 * manifest data, so that the pfree of checksum_payload
+					 * below doesn't free memory that the manifest owns.
+					 */
+					checksum_length = mfile->checksum_length;
+					checksum_payload = pg_malloc(checksum_length);
+					memcpy(checksum_payload, mfile->checksum_payload,
+						   checksum_length);
+				}
+			}
+
+			/*
+			 * If we're reusing a checksum, then we don't need copy_file() to
+			 * compute one for us, but otherwise, it needs to compute whatever
+			 * type of checksum we need.
+			 */
+			if (checksum_length != 0)
+				pg_checksum_init(&checksum_ctx, CHECKSUM_TYPE_NONE);
+			else
+				pg_checksum_init(&checksum_ctx, checksum_type);
+
+			/* Actually copy the file. */
+			snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, de->d_name);
+			copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run);
+
+			/*
+			 * If copy_file() performed a checksum calculation for us, then
+			 * save the results (except in dry-run mode, when there's no
+			 * point).
+			 */
+			if (checksum_ctx.type != CHECKSUM_TYPE_NONE && !opt->dry_run)
+			{
+				checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH);
+				checksum_length = pg_checksum_final(&checksum_ctx,
+													checksum_payload);
+			}
+		}
+
+		/* Generate manifest entry, if needed. */
+		if (mwriter != NULL)
+		{
+			struct stat sb;
+
+			/*
+			 * In order to generate a manifest entry, we need the file size
+			 * and mtime. We have no way to know the correct mtime except to
+			 * stat() the file, so just do that and get the size as well.
+			 *
+			 * If we didn't need the mtime here, we could try to obtain the
+			 * file size from the reconstruction or file copy process above,
+			 * although that is actually not convenient in all cases. If we
+			 * write the file ourselves then clearly we can keep a count of
+			 * bytes, but if we use something like CopyFile() then it's
+			 * trickier. Since we have to stat() anyway to get the mtime,
+			 * there's no point in worrying about it.
+			 */
+			if (stat(ofullpath, &sb) < 0)
+				pg_fatal("could not stat file \"%s\": %m", ofullpath);
+
+			/* OK, now do the work.
*/
+			add_file_to_manifest(mwriter, manifest_path,
+								 sb.st_size, sb.st_mtime,
+								 checksum_type, checksum_length,
+								 checksum_payload);
+		}
+
+		/* Avoid leaking memory. */
+		if (checksum_payload != NULL)
+			pfree(checksum_payload);
+	}
+
+	closedir(dir);
+}
+
+/*
+ * Read the version number from PG_VERSION and convert it to the usual server
+ * version number format. (e.g. If PG_VERSION contains "14\n" this function
+ * will return 140000)
+ */
+static int
+read_pg_version_file(char *directory)
+{
+	char		filename[MAXPGPATH];
+	StringInfoData buf;
+	int			fd;
+	int			version;
+	char	   *ep;
+
+	/* Construct pathname. */
+	snprintf(filename, MAXPGPATH, "%s/PG_VERSION", directory);
+
+	/* Open file. */
+	if ((fd = open(filename, O_RDONLY, 0)) < 0)
+		pg_fatal("could not open file \"%s\": %m", filename);
+
+	/* Read into memory. Length limit of 128 should be more than generous. */
+	initStringInfo(&buf);
+	slurp_file(fd, filename, &buf, 128);
+
+	/* Close the file. */
+	if (close(fd) != 0)
+		pg_fatal("could not close \"%s\": %m", filename);
+
+	/* Convert to integer. */
+	errno = 0;
+	version = strtoul(buf.data, &ep, 10);
+	if (errno != 0 || *ep != '\n')
+	{
+		/*
+		 * Incremental backup is not relevant to very old server versions that
+		 * used multi-part version numbers (e.g., 9.6 or 8.4). So if we see
+		 * what looks like the beginning of such a version number, just bail
+		 * out.
+		 */
+		if (version < 10 && *ep == '.')
+			pg_fatal("%s: server version too old", filename);
+		pg_fatal("%s: could not parse version number", filename);
+	}
+
+	/* Debugging output. */
+	pg_log_debug("read server version %d from \"%s\"", version, filename);
+
+	/* Release memory and return result. */
+	pfree(buf.data);
+	return version * 10000;
+}
+
+/*
+ * Add a directory to the list of output directories to clean up.
+ */
+static void
+remember_to_cleanup_directory(char *target_path, bool rmtopdir)
+{
+	cb_cleanup_dir *dir = pg_malloc(sizeof(cb_cleanup_dir));
+
+	dir->target_path = target_path;
+	dir->rmtopdir = rmtopdir;
+	dir->next = cleanup_dir_list;
+	cleanup_dir_list = dir;
+}
+
+/*
+ * Empty out the list of directories scheduled for cleanup at exit.
+ *
+ * We want to remove the output directories only on a failure, so call this
+ * function when we know that the operation has succeeded.
+ *
+ * Since we only expect this to be called when we're about to exit, we could
+ * just set cleanup_dir_list to NULL and be done with it, but we free the
+ * memory to be tidy.
+ */
+static void
+reset_directory_cleanup_list(void)
+{
+	while (cleanup_dir_list != NULL)
+	{
+		cb_cleanup_dir *dir = cleanup_dir_list;
+
+		cleanup_dir_list = cleanup_dir_list->next;
+		pfree(dir);
+	}
+}
+
+/*
+ * Scan the pg_tblspc directory of the final input backup to get a canonical
+ * list of what tablespaces are part of the backup.
+ *
+ * 'pathname' should be the path to the toplevel backup directory for the
+ * final backup in the backup chain.
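+ *
+ * For example (hypothetical paths): if the final backup contains the symlink
+ * pg_tblspc/16385 -> /ts1 and the user supplied -T /ts1=/newts1, the
+ * returned entry has oid 16385, old_dir "/ts1", and new_dir "/newts1". An
+ * in-place tablespace instead records paths under pg_tblspc itself.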
+ */
+static cb_tablespace *
+scan_for_existing_tablespaces(char *pathname, cb_options *opt)
+{
+	char		pg_tblspc[MAXPGPATH];
+	DIR		   *dir;
+	struct dirent *de;
+	cb_tablespace *tslist = NULL;
+
+	snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pathname);
+	pg_log_debug("scanning \"%s\"", pg_tblspc);
+
+	if ((dir = opendir(pg_tblspc)) == NULL)
+		pg_fatal("could not open directory \"%s\": %m", pg_tblspc);
+
+	while (errno = 0, (de = readdir(dir)) != NULL)
+	{
+		Oid			oid;
+		char		tblspcdir[MAXPGPATH];
+		char		link_target[MAXPGPATH];
+		int			link_length;
+		cb_tablespace *ts;
+		cb_tablespace *otherts;
+		PGFileType	type;
+
+		/* Silently ignore "." and ".." entries. */
+		if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+			continue;
+
+		/* Construct full pathname. */
+		snprintf(tblspcdir, MAXPGPATH, "%s/%s", pg_tblspc, de->d_name);
+
+		/* Ignore any file name that doesn't look like a proper OID. */
+		if (!parse_oid(de->d_name, &oid))
+		{
+			pg_log_debug("skipping \"%s\" because the filename is not a legal tablespace OID",
+						 tblspcdir);
+			continue;
+		}
+
+		/* Only symbolic links and directories are tablespaces. */
+		type = get_dirent_type(tblspcdir, de, false, PG_LOG_ERROR);
+		if (type == PGFILETYPE_ERROR)
+			exit(1);
+		if (type != PGFILETYPE_LNK && type != PGFILETYPE_DIR)
+		{
+			pg_log_debug("skipping \"%s\" because it is neither a symbolic link nor a directory",
+						 tblspcdir);
+			continue;
+		}
+
+		/* Create a new tablespace object. */
+		ts = pg_malloc0(sizeof(cb_tablespace));
+		ts->oid = oid;
+
+		/*
+		 * If it's a link, it's not an in-place tablespace. Otherwise, it must
+		 * be a directory, and thus an in-place tablespace.
+		 */
+		if (type == PGFILETYPE_LNK)
+		{
+			cb_tablespace_mapping *tsmap;
+
+			/* Read the link target. */
+			link_length = readlink(tblspcdir, link_target, sizeof(link_target));
+			if (link_length < 0)
+				pg_fatal("could not read symbolic link \"%s\": %m",
+						 tblspcdir);
+			if (link_length >= sizeof(link_target))
+				pg_fatal("symbolic link \"%s\" is too long", tblspcdir);
+			link_target[link_length] = '\0';
+			if (!is_absolute_path(link_target))
+				pg_fatal("symbolic link \"%s\" is relative", tblspcdir);
+
+			/* Canonicalize the link target. */
+			canonicalize_path(link_target);
+
+			/*
+			 * Find the corresponding tablespace mapping and copy the relevant
+			 * details into the new tablespace entry.
+			 */
+			for (tsmap = opt->tsmappings; tsmap != NULL; tsmap = tsmap->next)
+			{
+				if (strcmp(tsmap->old_dir, link_target) == 0)
+				{
+					strlcpy(ts->old_dir, tsmap->old_dir, MAXPGPATH);
+					strlcpy(ts->new_dir, tsmap->new_dir, MAXPGPATH);
+					ts->in_place = false;
+					break;
+				}
+			}
+
+			/* Every non-in-place tablespace must be mapped. */
+			if (tsmap == NULL)
+				pg_fatal("tablespace at \"%s\" has no tablespace mapping",
+						 link_target);
+		}
+		else
+		{
+			/*
+			 * For an in-place tablespace, there's no separate directory, so
+			 * we just record the paths within the data directories.
+			 */
+			snprintf(ts->old_dir, MAXPGPATH, "%s/%s", pg_tblspc, de->d_name);
+			snprintf(ts->new_dir, MAXPGPATH, "%s/pg_tblspc/%s", opt->output,
+					 de->d_name);
+			ts->in_place = true;
+		}
+
+		/* Tablespaces should not share a directory. */
+		for (otherts = tslist; otherts != NULL; otherts = otherts->next)
+			if (strcmp(ts->new_dir, otherts->new_dir) == 0)
+				pg_fatal("tablespaces with OIDs %u and %u both point at \"%s\"",
+						 otherts->oid, oid, ts->new_dir);
+
+		/* Add this tablespace to the list. */
+		ts->next = tslist;
+		tslist = ts;
+	}
+
+	return tslist;
+}
+
+/*
+ * Read a file into a StringInfo.
+ *
+ * fd is used for the actual file I/O, filename for error reporting purposes.
+ * A file longer than maxlen is a fatal error.
+ */
+static void
+slurp_file(int fd, char *filename, StringInfo buf, int maxlen)
+{
+	struct stat st;
+	ssize_t		rb;
+
+	/* Check file size, and complain if it's too large. */
+	if (fstat(fd, &st) != 0)
+		pg_fatal("could not stat \"%s\": %m", filename);
+	if (st.st_size > maxlen)
+		pg_fatal("file \"%s\" is too large", filename);
+
+	/* Make sure we have enough space. */
+	enlargeStringInfo(buf, st.st_size);
+
+	/* Read the data. */
+	rb = read(fd, &buf->data[buf->len], st.st_size);
+
+	/*
+	 * We don't expect any concurrent changes, so we should read exactly the
+	 * expected number of bytes.
+	 */
+	if (rb != st.st_size)
+	{
+		if (rb < 0)
+			pg_fatal("could not read file \"%s\": %m", filename);
+		else
+			pg_fatal("could not read file \"%s\": read only %d of %d bytes",
+					 filename, (int) rb, (int) st.st_size);
+	}
+
+	/* Adjust buffer length for new data and restore trailing-\0 invariant */
+	buf->len += rb;
+	buf->data[buf->len] = '\0';
+}
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c
new file mode 100644
index 00000000000..6decdd89340
--- /dev/null
+++ b/src/bin/pg_combinebackup/reconstruct.c
@@ -0,0 +1,687 @@
+/*-------------------------------------------------------------------------
+ *
+ * reconstruct.c
+ *		Reconstruct full file from incremental file and backup chain.
+ *
+ * Copyright (c) 2017-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/bin/pg_combinebackup/reconstruct.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres_fe.h"
+
+#include <unistd.h>
+
+#include "backup/basebackup_incremental.h"
+#include "common/logging.h"
+#include "common/file_perm.h"
+#include "copy_file.h"
+#include "lib/stringinfo.h"
+#include "reconstruct.h"
+#include "storage/block.h"
+
+/*
+ * An rfile stores the data that we need in order to be able to use some file
+ * on disk for reconstruction. For any given output file, we create one rfile
+ * per backup that we need to consult when constructing that output file.
+ *
+ * If we find a full version of the file in the backup chain, then only
+ * filename and fd are initialized; the remaining fields are 0 or NULL.
+ * For an incremental file, header_length, num_blocks, relative_block_numbers,
+ * and truncation_block_length are also set.
+ *
+ * num_blocks_read and highest_offset_read always start out as 0.
+ */
+typedef struct rfile
+{
+	char	   *filename;
+	int			fd;
+	size_t		header_length;
+	unsigned	num_blocks;
+	BlockNumber *relative_block_numbers;
+	unsigned	truncation_block_length;
+	unsigned	num_blocks_read;
+	off_t		highest_offset_read;
+} rfile;
+
+static void debug_reconstruction(int n_source,
+								 rfile **sources,
+								 bool dry_run);
+static unsigned find_reconstructed_block_length(rfile *s);
+static rfile *make_incremental_rfile(char *filename);
+static rfile *make_rfile(char *filename, bool missing_ok);
+static void write_reconstructed_file(char *input_filename,
+									 char *output_filename,
+									 unsigned block_length,
+									 rfile **sourcemap,
+									 off_t *offsetmap,
+									 pg_checksum_context *checksum_ctx,
+									 bool debug,
+									 bool dry_run);
+static void read_bytes(rfile *rf, void *buffer, unsigned length);
+
+/*
+ * Reconstruct a full file from an incremental file and a chain of prior
+ * backups.
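+ *
+ * As an illustrative example (block numbers invented): if the incremental
+ * file contains blocks 0 and 3 and has truncation_block_length 5, the
+ * reconstructed file will be 5 blocks long; blocks 0 and 3 come from the
+ * incremental file itself, while blocks 1, 2, and 4 must be found in an
+ * older backup in the chain or, failing that, are zero-filled.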
+ * + * input_filename should be the path to the incremental file, and + * output_filename should be the path where the reconstructed file is to be + * written. + * + * relative_path should be the relative path to the directory containing this + * file. bare_file_name should be the name of the file within that directory, + * without "INCREMENTAL.". + * + * n_prior_backups is the number of prior backups, and prior_backup_dirs is + * an array of pathnames where those backups can be found. + */ +void +reconstruct_from_incremental_file(char *input_filename, + char *output_filename, + char *relative_path, + char *bare_file_name, + int n_prior_backups, + char **prior_backup_dirs, + manifest_data **manifests, + char *manifest_path, + pg_checksum_type checksum_type, + int *checksum_length, + uint8 **checksum_payload, + bool debug, + bool dry_run) +{ + rfile **source; + rfile *latest_source = NULL; + rfile **sourcemap; + off_t *offsetmap; + unsigned block_length; + unsigned i; + unsigned sidx = n_prior_backups; + bool full_copy_possible = true; + int copy_source_index = -1; + rfile *copy_source = NULL; + pg_checksum_context checksum_ctx; + + /* + * Every block must come either from the latest version of the file or + * from one of the prior backups. + */ + source = pg_malloc0(sizeof(rfile *) * (1 + n_prior_backups)); + + /* + * Use the information from the latest incremental file to figure out how + * long the reconstructed file should be. + */ + latest_source = make_incremental_rfile(input_filename); + source[n_prior_backups] = latest_source; + block_length = find_reconstructed_block_length(latest_source); + + /* + * For each block in the output file, we need to know from which file we + * need to obtain it and at what offset in that file it's stored. + * sourcemap gives us the first of these things, and offsetmap the latter. + */ + sourcemap = pg_malloc0(sizeof(rfile *) * block_length); + offsetmap = pg_malloc0(sizeof(off_t) * block_length); + + /* + * Every block that is present in the newest incremental file should be + * sourced from that file. If it precedes the truncation_block_length, + * it's a block that we would otherwise have had to find in an older + * backup and thus reduces the number of blocks remaining to be found by + * one; otherwise, it's an extra block that needs to be included in the + * output but would not have needed to be found in an older backup if it + * had not been present. + */ + for (i = 0; i < latest_source->num_blocks; ++i) + { + BlockNumber b = latest_source->relative_block_numbers[i]; + + Assert(b < block_length); + sourcemap[b] = latest_source; + offsetmap[b] = latest_source->header_length + (i * BLCKSZ); + + /* + * A full copy of a file from an earlier backup is only possible if no + * blocks are needed from any later incremental file. + */ + full_copy_possible = false; + } + + while (1) + { + char source_filename[MAXPGPATH]; + rfile *s; + + /* + * Move to the next backup in the chain. If there are no more, then + * we're done. + */ + if (sidx == 0) + break; + --sidx; + + /* + * Look for the full file in the previous backup. If not found, then + * look for an incremental file instead. 
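+		 *
+		 * For example, with bare_file_name "16384" and relative_path
+		 * "base/5", this tries "<prior_backup_dir>/base/5/16384" first and
+		 * then "<prior_backup_dir>/base/5/INCREMENTAL.16384".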
+ */ + snprintf(source_filename, MAXPGPATH, "%s/%s/%s", + prior_backup_dirs[sidx], relative_path, bare_file_name); + if ((s = make_rfile(source_filename, true)) == NULL) + { + snprintf(source_filename, MAXPGPATH, "%s/%s/INCREMENTAL.%s", + prior_backup_dirs[sidx], relative_path, bare_file_name); + s = make_incremental_rfile(source_filename); + } + source[sidx] = s; + + /* + * If s->header_length == 0, then this is a full file; otherwise, it's + * an incremental file. + */ + if (s->header_length == 0) + { + struct stat sb; + BlockNumber b; + BlockNumber blocklength; + + /* We need to know the length of the file. */ + if (fstat(s->fd, &sb) < 0) + pg_fatal("could not stat \"%s\": %m", s->filename); + + /* + * Since we found a full file, source all blocks from it that + * exist in the file. + * + * Note that there may be blocks that don't exist either in this + * file or in any incremental file but that precede + * truncation_block_length. These are, presumably, zero-filled + * blocks that result from the server extending the file but + * taking no action on those blocks that generated any WAL. + * + * Sadly, we have no way of validating that this is really what + * happened, and neither does the server. From it's perspective, + * an unmodified block that contains data looks exactly the same + * as a zero-filled block that never had any data: either way, + * it's not mentioned in any WAL summary and the server has no + * reason to read it. From our perspective, all we know is that + * nobody had a reason to back up the block. That certainly means + * that the block didn't exist at the time of the full backup, but + * the supposition that it was all zeroes at the time of every + * later backup is one that we can't validate. + */ + blocklength = sb.st_size / BLCKSZ; + for (b = 0; b < latest_source->truncation_block_length; ++b) + { + if (sourcemap[b] == NULL && b < blocklength) + { + sourcemap[b] = s; + offsetmap[b] = b * BLCKSZ; + } + } + + /* + * If a full copy looks possible, check whether the resulting file + * should be exactly as long as the source file is. If so, a full + * copy is acceptable, otherwise not. + */ + if (full_copy_possible) + { + uint64 expected_length; + + expected_length = + (uint64) latest_source->truncation_block_length; + expected_length *= BLCKSZ; + if (expected_length == sb.st_size) + { + copy_source = s; + copy_source_index = sidx; + } + } + + /* We don't need to consider any further sources. */ + break; + } + + /* + * Since we found another incremental file, source all blocks from it + * that we need but don't yet have. + */ + for (i = 0; i < s->num_blocks; ++i) + { + BlockNumber b = s->relative_block_numbers[i]; + + if (b < latest_source->truncation_block_length && + sourcemap[b] == NULL) + { + sourcemap[b] = s; + offsetmap[b] = s->header_length + (i * BLCKSZ); + + /* + * A full copy of a file from an earlier backup is only + * possible if no blocks are needed from any later incremental + * file. + */ + full_copy_possible = false; + } + } + } + + /* + * If a checksum of the required type already exists in the + * backup_manifest for the relevant input directory, we can save some work + * by reusing that checksum instead of computing a new one. 
+ */ + if (copy_source_index >= 0 && manifests[copy_source_index] != NULL && + checksum_type != CHECKSUM_TYPE_NONE) + { + manifest_file *mfile; + + mfile = manifest_files_lookup(manifests[copy_source_index]->files, + manifest_path); + if (mfile == NULL) + { + char *path = psprintf("%s/backup_manifest", + prior_backup_dirs[copy_source_index]); + + /* + * The directory is out of sync with the backup_manifest, so emit + * a warning. + */ + /*- translator: the first %s is a backup manifest file, the second is a file absent therein */ + pg_log_warning("\"%s\" contains no entry for \"%s\"", + path, + manifest_path); + pfree(path); + } + else if (mfile->checksum_type == checksum_type) + { + *checksum_length = mfile->checksum_length; + *checksum_payload = pg_malloc(*checksum_length); + memcpy(*checksum_payload, mfile->checksum_payload, + *checksum_length); + checksum_type = CHECKSUM_TYPE_NONE; + } + } + + /* Prepare for checksum calculation, if required. */ + pg_checksum_init(&checksum_ctx, checksum_type); + + /* + * If the full file can be created by copying a file from an older backup + * in the chain without needing to overwrite any blocks or truncate the + * result, then forget about performing reconstruction and just copy that + * file in its entirety. + * + * Otherwise, reconstruct. + */ + if (copy_source != NULL) + copy_file(copy_source->filename, output_filename, + &checksum_ctx, dry_run); + else + { + write_reconstructed_file(input_filename, output_filename, + block_length, sourcemap, offsetmap, + &checksum_ctx, debug, dry_run); + debug_reconstruction(n_prior_backups + 1, source, dry_run); + } + + /* Save results of checksum calculation. */ + if (checksum_type != CHECKSUM_TYPE_NONE) + { + *checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH); + *checksum_length = pg_checksum_final(&checksum_ctx, + *checksum_payload); + } + + /* + * Close files and release memory. + */ + for (i = 0; i <= n_prior_backups; ++i) + { + rfile *s = source[i]; + + if (s == NULL) + continue; + if (close(s->fd) != 0) + pg_fatal("could not close \"%s\": %m", s->filename); + if (s->relative_block_numbers != NULL) + pfree(s->relative_block_numbers); + pg_free(s->filename); + } + pfree(sourcemap); + pfree(offsetmap); + pfree(source); +} + +/* + * Perform post-reconstruction logging and sanity checks. + */ +static void +debug_reconstruction(int n_source, rfile **sources, bool dry_run) +{ + unsigned i; + + for (i = 0; i < n_source; ++i) + { + rfile *s = sources[i]; + + /* Ignore source if not used. */ + if (s == NULL) + continue; + + /* If no data is needed from this file, we can ignore it. */ + if (s->num_blocks_read == 0) + continue; + + /* Debug logging. */ + if (dry_run) + pg_log_debug("would have read %u blocks from \"%s\"", + s->num_blocks_read, s->filename); + else + pg_log_debug("read %u blocks from \"%s\"", + s->num_blocks_read, s->filename); + + /* + * In dry-run mode, we don't actually try to read data from the file, + * but we do try to verify that the file is long enough that we could + * have read the data if we'd tried. + * + * If this fails, then it means that a non-dry-run attempt would fail, + * complaining of not being able to read the required bytes from the + * file. 
+		 */
+		if (dry_run)
+		{
+			struct stat sb;
+
+			if (fstat(s->fd, &sb) < 0)
+				pg_fatal("could not stat \"%s\": %m", s->filename);
+			if (sb.st_size < s->highest_offset_read)
+				pg_fatal("file \"%s\" is too short: expected %llu, found %llu",
+						 s->filename,
+						 (unsigned long long) s->highest_offset_read,
+						 (unsigned long long) sb.st_size);
+		}
+	}
+}
+
+/*
+ * When we perform reconstruction using an incremental file, the output file
+ * should be at least as long as the truncation_block_length. Any blocks
+ * present in the incremental file increase the output length as far as is
+ * necessary to include those blocks.
+ */
+static unsigned
+find_reconstructed_block_length(rfile *s)
+{
+	unsigned	block_length = s->truncation_block_length;
+	unsigned	i;
+
+	for (i = 0; i < s->num_blocks; ++i)
+		if (s->relative_block_numbers[i] >= block_length)
+			block_length = s->relative_block_numbers[i] + 1;
+
+	return block_length;
+}
+
+/*
+ * Initialize an incremental rfile, reading the header so that we know which
+ * blocks it contains.
+ */
+static rfile *
+make_incremental_rfile(char *filename)
+{
+	rfile	   *rf;
+	unsigned	magic;
+
+	rf = make_rfile(filename, false);
+
+	/* Read and validate magic number. */
+	read_bytes(rf, &magic, sizeof(magic));
+	if (magic != INCREMENTAL_MAGIC)
+		pg_fatal("file \"%s\" has bad incremental magic number (0x%x not 0x%x)",
+				 filename, magic, INCREMENTAL_MAGIC);
+
+	/* Read block count. */
+	read_bytes(rf, &rf->num_blocks, sizeof(rf->num_blocks));
+	if (rf->num_blocks > RELSEG_SIZE)
+		pg_fatal("file \"%s\" has block count %u in excess of segment size %u",
+				 filename, rf->num_blocks, RELSEG_SIZE);
+
+	/* Read truncation block length. */
+	read_bytes(rf, &rf->truncation_block_length,
+			   sizeof(rf->truncation_block_length));
+	if (rf->truncation_block_length > RELSEG_SIZE)
+		pg_fatal("file \"%s\" has truncation block length %u in excess of segment size %u",
+				 filename, rf->truncation_block_length, RELSEG_SIZE);
+
+	/* Read block numbers if there are any. */
+	if (rf->num_blocks > 0)
+	{
+		rf->relative_block_numbers =
+			pg_malloc0(sizeof(BlockNumber) * rf->num_blocks);
+		read_bytes(rf, rf->relative_block_numbers,
+				   sizeof(BlockNumber) * rf->num_blocks);
+	}
+
+	/* Remember length of header. */
+	rf->header_length = sizeof(magic) + sizeof(rf->num_blocks) +
+		sizeof(rf->truncation_block_length) +
+		sizeof(BlockNumber) * rf->num_blocks;
+
+	return rf;
+}
+
+/*
+ * Allocate and perform basic initialization of an rfile.
+ */
+static rfile *
+make_rfile(char *filename, bool missing_ok)
+{
+	rfile	   *rf;
+
+	rf = pg_malloc0(sizeof(rfile));
+	rf->filename = pstrdup(filename);
+	if ((rf->fd = open(filename, O_RDONLY | PG_BINARY, 0)) < 0)
+	{
+		if (missing_ok && errno == ENOENT)
+		{
+			pg_free(rf);
+			return NULL;
+		}
+		pg_fatal("could not open file \"%s\": %m", filename);
+	}
+
+	return rf;
+}
+
+/*
+ * Read the indicated number of bytes from an rfile into the buffer.
+ */
+static void
+read_bytes(rfile *rf, void *buffer, unsigned length)
+{
+	ssize_t		rb = read(rf->fd, buffer, length);
+
+	if (rb != (ssize_t) length)
+	{
+		if (rb < 0)
+			pg_fatal("could not read file \"%s\": %m", rf->filename);
+		else
+			pg_fatal("could not read file \"%s\": read only %d of %d bytes",
+					 rf->filename, (int) rb, (int) length);
+	}
+}
+
+/*
+ * Write out a reconstructed file.
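+ *
+ * When debug logging is enabled, the reconstruction plan is reported in
+ * ranges of blocks; an illustrative log line (with invented paths) might
+ * read: "reconstruction plan: 0-6:/backups/full/base/5/16384@0 7:zero".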
+ */
+static void
+write_reconstructed_file(char *input_filename,
+						 char *output_filename,
+						 unsigned block_length,
+						 rfile **sourcemap,
+						 off_t *offsetmap,
+						 pg_checksum_context *checksum_ctx,
+						 bool debug,
+						 bool dry_run)
+{
+	int			wfd = -1;
+	unsigned	i;
+	unsigned	zero_blocks = 0;
+
+	/* Debugging output. */
+	if (debug)
+	{
+		StringInfoData debug_buf;
+		unsigned	start_of_range = 0;
+		unsigned	current_block = 0;
+
+		/* Basic information about the output file to be produced. */
+		if (dry_run)
+			pg_log_debug("would reconstruct \"%s\" (%u blocks, checksum %s)",
+						 output_filename, block_length,
+						 pg_checksum_type_name(checksum_ctx->type));
+		else
+			pg_log_debug("reconstructing \"%s\" (%u blocks, checksum %s)",
+						 output_filename, block_length,
+						 pg_checksum_type_name(checksum_ctx->type));
+
+		/* Print out the plan for reconstructing this file. */
+		initStringInfo(&debug_buf);
+		while (current_block < block_length)
+		{
+			rfile	   *s = sourcemap[current_block];
+
+			/* Extend range, if possible. */
+			if (current_block + 1 < block_length &&
+				s == sourcemap[current_block + 1])
+			{
+				++current_block;
+				continue;
+			}
+
+			/* Add details about this range. */
+			if (s == NULL)
+			{
+				if (current_block == start_of_range)
+					appendStringInfo(&debug_buf, " %u:zero", current_block);
+				else
+					appendStringInfo(&debug_buf, " %u-%u:zero",
+									 start_of_range, current_block);
+			}
+			else
+			{
+				if (current_block == start_of_range)
+					appendStringInfo(&debug_buf, " %u:%s@" UINT64_FORMAT,
+									 current_block, s->filename,
+									 (uint64) offsetmap[current_block]);
+				else
+					appendStringInfo(&debug_buf, " %u-%u:%s@" UINT64_FORMAT,
+									 start_of_range, current_block,
+									 s->filename,
+									 (uint64) offsetmap[current_block]);
+			}
+
+			/* Begin new range. */
+			start_of_range = ++current_block;
+
+			/* If the output is very long or we are done, dump it now. */
+			if (current_block == block_length || debug_buf.len > 1024)
+			{
+				pg_log_debug("reconstruction plan:%s", debug_buf.data);
+				resetStringInfo(&debug_buf);
+			}
+		}
+
+		/* Free memory. */
+		pfree(debug_buf.data);
+	}
+
+	/* Open the output file, except in dry_run mode. */
+	if (!dry_run &&
+		(wfd = open(output_filename,
+					O_RDWR | PG_BINARY | O_CREAT | O_EXCL,
+					pg_file_create_mode)) < 0)
+		pg_fatal("could not open file \"%s\": %m", output_filename);
+
+	/* Read and write the blocks as required. */
+	for (i = 0; i < block_length; ++i)
+	{
+		uint8		buffer[BLCKSZ];
+		rfile	   *s = sourcemap[i];
+		ssize_t		wb;
+
+		/* Update accounting information. */
+		if (s == NULL)
+			++zero_blocks;
+		else
+		{
+			s->num_blocks_read++;
+			s->highest_offset_read = Max(s->highest_offset_read,
+										 offsetmap[i] + BLCKSZ);
+		}
+
+		/* Skip the rest of this in dry-run mode. */
+		if (dry_run)
+			continue;
+
+		/* Read or zero-fill the block as appropriate. */
+		if (s == NULL)
+		{
+			/*
+			 * New block not mentioned in the WAL summary. Should have been an
+			 * uninitialized block, so just zero-fill it.
+			 */
+			memset(buffer, 0, BLCKSZ);
+		}
+		else
+		{
+			ssize_t		rb;
+
+			/* Read the block from the correct source. */
+			rb = pg_pread(s->fd, buffer, BLCKSZ, offsetmap[i]);
+			if (rb != BLCKSZ)
+			{
+				if (rb < 0)
+					pg_fatal("could not read file \"%s\": %m", s->filename);
+				else
+					pg_fatal("could not read file \"%s\": read only %d of %d bytes at offset %u",
+							 s->filename, (int) rb, BLCKSZ,
+							 (unsigned) offsetmap[i]);
+			}
+		}
+
+		/* Write out the block.
*/ + if ((wb = write(wfd, buffer, BLCKSZ)) != BLCKSZ) + { + if (wb < 0) + pg_fatal("could not write file \"%s\": %m", output_filename); + else + pg_fatal("could not write file \"%s\": wrote only %d of %d bytes", + output_filename, (int) wb, BLCKSZ); + } + + /* Update the checksum computation. */ + if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0) + pg_fatal("could not update checksum of file \"%s\"", + output_filename); + } + + /* Debugging output. */ + if (zero_blocks > 0) + { + if (dry_run) + pg_log_debug("would have zero-filled %u blocks", zero_blocks); + else + pg_log_debug("zero-filled %u blocks", zero_blocks); + } + + /* Close the output file. */ + if (wfd >= 0 && close(wfd) != 0) + pg_fatal("could not close \"%s\": %m", output_filename); +} diff --git a/src/bin/pg_combinebackup/reconstruct.h b/src/bin/pg_combinebackup/reconstruct.h new file mode 100644 index 00000000000..d689aeb5c20 --- /dev/null +++ b/src/bin/pg_combinebackup/reconstruct.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * reconstruct.h + * Reconstruct full file from incremental file and backup chain. + * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/bin/pg_combinebackup/reconstruct.h + * + *------------------------------------------------------------------------- + */ +#ifndef RECONSTRUCT_H +#define RECONSTRUCT_H + +#include "common/checksum_helper.h" +#include "load_manifest.h" + +extern void reconstruct_from_incremental_file(char *input_filename, + char *output_filename, + char *relative_path, + char *bare_file_name, + int n_prior_backups, + char **prior_backup_dirs, + manifest_data **manifests, + char *manifest_path, + pg_checksum_type checksum_type, + int *checksum_length, + uint8 **checksum_payload, + bool debug, + bool dry_run); + +#endif diff --git a/src/bin/pg_combinebackup/t/001_basic.pl b/src/bin/pg_combinebackup/t/001_basic.pl new file mode 100644 index 00000000000..fb66075d1a8 --- /dev/null +++ b/src/bin/pg_combinebackup/t/001_basic.pl @@ -0,0 +1,23 @@ +# Copyright (c) 2021-2023, PostgreSQL Global Development Group + +use strict; +use warnings; +use PostgreSQL::Test::Utils; +use Test::More; + +my $tempdir = PostgreSQL::Test::Utils::tempdir; + +program_help_ok('pg_combinebackup'); +program_version_ok('pg_combinebackup'); +program_options_handling_ok('pg_combinebackup'); + +command_fails_like( + ['pg_combinebackup'], + qr/no input directories specified/, + 'input directories must be specified'); +command_fails_like( + [ 'pg_combinebackup', $tempdir ], + qr/no output directory specified/, + 'output directory must be specified'); + +done_testing(); diff --git a/src/bin/pg_combinebackup/t/002_compare_backups.pl b/src/bin/pg_combinebackup/t/002_compare_backups.pl new file mode 100644 index 00000000000..0b80455aff1 --- /dev/null +++ b/src/bin/pg_combinebackup/t/002_compare_backups.pl @@ -0,0 +1,154 @@ +# Copyright (c) 2021-2023, PostgreSQL Global Development Group + +use strict; +use warnings; +use File::Compare; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Set up a new database instance. +my $primary = PostgreSQL::Test::Cluster->new('primary'); +$primary->init(has_archiving => 1, allows_streaming => 1); +$primary->append_conf('postgresql.conf', 'summarize_wal = on'); +$primary->start; + +# Create some test tables, each containing one row of data, plus a whole +# extra database. 
+$primary->safe_psql('postgres', <<EOM); +CREATE TABLE will_change (a int, b text); +INSERT INTO will_change VALUES (1, 'initial test row'); +CREATE TABLE will_grow (a int, b text); +INSERT INTO will_grow VALUES (1, 'initial test row'); +CREATE TABLE will_shrink (a int, b text); +INSERT INTO will_shrink VALUES (1, 'initial test row'); +CREATE TABLE will_get_vacuumed (a int, b text); +INSERT INTO will_get_vacuumed VALUES (1, 'initial test row'); +CREATE TABLE will_get_dropped (a int, b text); +INSERT INTO will_get_dropped VALUES (1, 'initial test row'); +CREATE TABLE will_get_rewritten (a int, b text); +INSERT INTO will_get_rewritten VALUES (1, 'initial test row'); +CREATE DATABASE db_will_get_dropped; +EOM + +# Take a full backup. +my $backup1path = $primary->backup_dir . '/backup1'; +$primary->command_ok( + [ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ], + "full backup"); + +# Now make some database changes. +$primary->safe_psql('postgres', <<EOM); +UPDATE will_change SET b = 'modified value' WHERE a = 1; +INSERT INTO will_grow + SELECT g, 'additional row' FROM generate_series(2, 5000) g; +TRUNCATE will_shrink; +VACUUM will_get_vacuumed; +DROP TABLE will_get_dropped; +CREATE TABLE newly_created (a int, b text); +INSERT INTO newly_created VALUES (1, 'row for new table'); +VACUUM FULL will_get_rewritten; +DROP DATABASE db_will_get_dropped; +CREATE DATABASE db_newly_created; +EOM + +# Take an incremental backup. +my $backup2path = $primary->backup_dir . '/backup2'; +$primary->command_ok( + [ 'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast', + '--incremental', $backup1path . '/backup_manifest' ], + "incremental backup"); + +# Find an LSN to which either backup can be recovered. +my $lsn = $primary->safe_psql('postgres', "SELECT pg_current_wal_lsn();"); + +# Make sure that the WAL segment containing that LSN has been archived. +# PostgreSQL won't issue two consecutive XLOG_SWITCH records, and the backup +# just issued one, so call txid_current() to generate some WAL activity +# before calling pg_switch_wal(). +$primary->safe_psql('postgres', 'SELECT txid_current();'); +$primary->safe_psql('postgres', 'SELECT pg_switch_wal()'); + +# Now wait for the LSN we chose above to be archived. +my $archive_wait_query = + "SELECT pg_walfile_name('$lsn') <= last_archived_wal FROM pg_stat_archiver;"; +$primary->poll_query_until('postgres', $archive_wait_query) + or die "Timed out while waiting for WAL segment to be archived"; + +# Perform PITR from the full backup. Disable archive_mode so that the archive +# doesn't find out about the new timeline; that way, the later PITR below will +# choose the same timeline. +my $pitr1 = PostgreSQL::Test::Cluster->new('pitr1'); +$pitr1->init_from_backup($primary, 'backup1', + standby => 1, has_restoring => 1); +$pitr1->append_conf('postgresql.conf', qq{ +recovery_target_lsn = '$lsn' +recovery_target_action = 'promote' +archive_mode = 'off' +}); +$pitr1->start(); + +# Perform PITR to the same LSN from the incremental backup. Use the same +# basic configuration as before. +my $pitr2 = PostgreSQL::Test::Cluster->new('pitr2'); +$pitr2->init_from_backup($primary, 'backup2', + standby => 1, has_restoring => 1, + combine_with_prior => [ 'backup1' ]); +$pitr2->append_conf('postgresql.conf', qq{ +recovery_target_lsn = '$lsn' +recovery_target_action = 'promote' +archive_mode = 'off' +}); +$pitr2->start(); + +# Wait until both servers exit recovery. 
+$pitr1->poll_query_until('postgres',
+	"SELECT NOT pg_is_in_recovery();")
+  or die "Timed out while waiting for apply to reach LSN $lsn";
+$pitr2->poll_query_until('postgres',
+	"SELECT NOT pg_is_in_recovery();")
+  or die "Timed out while waiting for apply to reach LSN $lsn";
+
+# Perform a logical dump of each server, and check that they match.
+# It would be much nicer if we could physically compare the data files, but
+# that doesn't really work. The contents of the page hole aren't guaranteed to
+# be identical, and there can be other discrepancies as well. To make this work
+# we'd need the equivalent of each AM's rm_mask function written or at least
+# callable from Perl, and that doesn't seem practical.
+#
+# NB: We're just using the primary's backup directory for scratch space here.
+# This could equally well be any other directory we wanted to pick.
+my $backupdir = $primary->backup_dir;
+my $dump1 = $backupdir . '/pitr1.dump';
+my $dump2 = $backupdir . '/pitr2.dump';
+$pitr1->command_ok([
+		'pg_dumpall', '-f', $dump1, '--no-sync', '--no-unlogged-table-data',
+		'-d', $pitr1->connstr('postgres'),
+	],
+	'dump from PITR 1');
+$pitr2->command_ok([
+		'pg_dumpall', '-f', $dump2, '--no-sync', '--no-unlogged-table-data',
+		'-d', $pitr2->connstr('postgres'),
+	],
+	'dump from PITR 2');
+
+# Compare the two dumps; there should be no differences.
+my $compare_res = compare($dump1, $dump2);
+note($dump1);
+note($dump2);
+is($compare_res, 0, "dumps are identical");
+
+# Provide more context if the dumps do not match.
+if ($compare_res != 0)
+{
+	my ($stdout, $stderr) =
+	  run_command([ 'diff', '-u', $dump1, $dump2 ]);
+	print "=== diff of $dump1 and $dump2\n";
+	print "=== stdout ===\n";
+	print $stdout;
+	print "=== stderr ===\n";
+	print $stderr;
+	print "=== EOF ===\n";
+}
+
+done_testing();
diff --git a/src/bin/pg_combinebackup/t/003_timeline.pl b/src/bin/pg_combinebackup/t/003_timeline.pl
new file mode 100644
index 00000000000..bc053ca5e8c
--- /dev/null
+++ b/src/bin/pg_combinebackup/t/003_timeline.pl
@@ -0,0 +1,90 @@
+# Copyright (c) 2021-2023, PostgreSQL Global Development Group
+#
+# This test aims to validate that restoring an incremental backup works
+# properly even when the reference backup is on a different timeline.
+
+use strict;
+use warnings;
+use File::Compare;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Set up a new database instance.
+my $node1 = PostgreSQL::Test::Cluster->new('node1');
+$node1->init(has_archiving => 1, allows_streaming => 1);
+$node1->append_conf('postgresql.conf', 'summarize_wal = on');
+$node1->start;
+
+# Create a table and insert a test row into it.
+$node1->safe_psql('postgres', <<EOM);
+CREATE TABLE mytable (a int, b text);
+INSERT INTO mytable VALUES (1, 'aardvark');
+EOM
+
+# Take a full backup.
+my $backup1path = $node1->backup_dir . '/backup1';
+$node1->command_ok(
+	[ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ],
+	"full backup from node1");
+
+# Insert a second row on the original node.
+$node1->safe_psql('postgres', <<EOM);
+INSERT INTO mytable VALUES (2, 'beetle');
+EOM
+
+# Now take an incremental backup.
+my $backup2path = $node1->backup_dir . '/backup2';
+$node1->command_ok(
+	[ 'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast',
+	  '--incremental', $backup1path . '/backup_manifest' ],
+	"incremental backup from node1");
+
+# Restore the incremental backup and use it to create a new node.
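+# (As a note on the test plumbing: combine_with_prior instructs
+# init_from_backup to run pg_combinebackup over the listed prior backups
+# together with this one before starting the node.)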
+my $node2 = PostgreSQL::Test::Cluster->new('node2');
+$node2->init_from_backup($node1, 'backup2',
+	combine_with_prior => [ 'backup1' ]);
+$node2->start();
+
+# Insert rows on both nodes.
+$node1->safe_psql('postgres', <<EOM);
+INSERT INTO mytable VALUES (3, 'crab');
+EOM
+$node2->safe_psql('postgres', <<EOM);
+INSERT INTO mytable VALUES (4, 'dingo');
+EOM
+
+# Take another incremental backup, from node2, based on backup2 from node1.
+my $backup3path = $node1->backup_dir . '/backup3';
+$node2->command_ok(
+	[ 'pg_basebackup', '-D', $backup3path, '--no-sync', '-cfast',
+	  '--incremental', $backup2path . '/backup_manifest' ],
+	"incremental backup from node2");
+
+# Restore the second incremental backup and use it to create a third node.
+my $node3 = PostgreSQL::Test::Cluster->new('node3');
+$node3->init_from_backup($node1, 'backup3',
+	combine_with_prior => [ 'backup1', 'backup2' ]);
+$node3->start();
+
+# Let's insert one more row.
+$node3->safe_psql('postgres', <<EOM);
+INSERT INTO mytable VALUES (5, 'elephant');
+EOM
+
+# Now check that we have the expected rows.
+my $result = $node3->safe_psql('postgres', <<EOM);
+select string_agg(a::text, ':'), string_agg(b, ':') from mytable;
+EOM
+is($result, '1:2:4:5|aardvark:beetle:dingo:elephant',
+	'table contains expected rows');
+
+# Let's also verify all the backups.
+for my $backup_name (qw(backup1 backup2 backup3))
+{
+	$node1->command_ok(
+		[ 'pg_verifybackup', $node1->backup_dir . '/' . $backup_name ],
+		"verify backup $backup_name");
+}
+
+# OK, that's all.
+done_testing();
diff --git a/src/bin/pg_combinebackup/t/004_manifest.pl b/src/bin/pg_combinebackup/t/004_manifest.pl
new file mode 100644
index 00000000000..37de61ac061
--- /dev/null
+++ b/src/bin/pg_combinebackup/t/004_manifest.pl
@@ -0,0 +1,75 @@
+# Copyright (c) 2021-2023, PostgreSQL Global Development Group
+#
+# This test aims to validate that pg_combinebackup works in the degenerate
+# case where it is invoked on a single full backup and that it can produce
+# a new, valid manifest when it does. Secondarily, it checks that
+# pg_combinebackup does not produce a manifest when run with --no-manifest.
+
+use strict;
+use warnings;
+use File::Compare;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Set up a new database instance.
+my $node = PostgreSQL::Test::Cluster->new('node');
+$node->init(has_archiving => 1, allows_streaming => 1);
+$node->start;
+
+# Take a full backup.
+my $original_backup_path = $node->backup_dir . '/original';
+$node->command_ok(
+	[ 'pg_basebackup', '-D', $original_backup_path, '--no-sync', '-cfast' ],
+	"full backup");
+
+# Verify the full backup.
+$node->command_ok([ 'pg_verifybackup', $original_backup_path ],
+	"verify original backup");
+
+# Process the backup with pg_combinebackup using various manifest options.
+sub combine_and_test_one_backup
+{
+	my ($backup_name, $failure_pattern, @extra_options) = @_;
+	my $revised_backup_path = $node->backup_dir . '/' .
$backup_name;
+	$node->command_ok(
+		[ 'pg_combinebackup', $original_backup_path, '-o', $revised_backup_path,
+		  '--no-sync', @extra_options ],
+		"pg_combinebackup with @extra_options");
+	if (defined $failure_pattern)
+	{
+		$node->command_fails_like(
+			[ 'pg_verifybackup', $revised_backup_path ],
+			$failure_pattern,
+			"unable to verify backup $backup_name");
+	}
+	else
+	{
+		$node->command_ok(
+			[ 'pg_verifybackup', $revised_backup_path ],
+			"verify backup $backup_name");
+	}
+}
+combine_and_test_one_backup('nomanifest',
+	qr/could not open file.*backup_manifest/, '--no-manifest');
+combine_and_test_one_backup('csum_none',
+	undef, '--manifest-checksums=NONE');
+combine_and_test_one_backup('csum_sha224',
+	undef, '--manifest-checksums=SHA224');
+
+# Verify that SHA224 is mentioned in the SHA224 manifest lots of times.
+my $sha224_manifest =
+  slurp_file($node->backup_dir . '/csum_sha224/backup_manifest');
+my $sha224_count = (() = $sha224_manifest =~ /SHA224/mig);
+cmp_ok($sha224_count,
+	'>', 100, "SHA224 is mentioned many times in SHA224 manifest");
+
+# Verify that no checksum algorithm is mentioned in the no-checksum manifest.
+my $nocsum_manifest =
+  slurp_file($node->backup_dir . '/csum_none/backup_manifest');
+my $nocsum_count = (() = $nocsum_manifest =~ /Checksum-Algorithm/mig);
+is($nocsum_count, 0,
+	"Checksum-Algorithm is not mentioned in no-checksum manifest");
+
+# OK, that's all.
+done_testing();
diff --git a/src/bin/pg_combinebackup/t/005_integrity.pl b/src/bin/pg_combinebackup/t/005_integrity.pl
new file mode 100644
index 00000000000..b1f63a43e07
--- /dev/null
+++ b/src/bin/pg_combinebackup/t/005_integrity.pl
@@ -0,0 +1,125 @@
+# Copyright (c) 2021-2023, PostgreSQL Global Development Group
+#
+# This test aims to validate that an incremental backup can be combined
+# with a valid prior backup and that it cannot be combined with an invalid
+# prior backup.
+
+use strict;
+use warnings;
+use File::Compare;
+use File::Path qw(rmtree);
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Set up a new database instance.
+my $node1 = PostgreSQL::Test::Cluster->new('node1');
+$node1->init(has_archiving => 1, allows_streaming => 1);
+$node1->append_conf('postgresql.conf', 'summarize_wal = on');
+$node1->start;
+
+# Set up another new database instance. We don't want to use the cached
+# INITDB_TEMPLATE for this, because we want it to be a separate cluster
+# with a different system ID.
+my $node2;
+{
+	local $ENV{'INITDB_TEMPLATE'} = undef;
+
+	$node2 = PostgreSQL::Test::Cluster->new('node2');
+	$node2->init(has_archiving => 1, allows_streaming => 1);
+	$node2->append_conf('postgresql.conf', 'summarize_wal = on');
+	$node2->start;
+}
+
+# Take a full backup from node1.
+my $backup1path = $node1->backup_dir . '/backup1';
+$node1->command_ok(
+	[ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ],
+	"full backup from node1");
+
+# Now take an incremental backup.
+my $backup2path = $node1->backup_dir . '/backup2';
+$node1->command_ok(
+	[ 'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast',
+	  '--incremental', $backup1path . '/backup_manifest' ],
+	"incremental backup from node1");
+
+# Now take another incremental backup.
+my $backup3path = $node1->backup_dir . '/backup3';
+$node1->command_ok(
+	[ 'pg_basebackup', '-D', $backup3path, '--no-sync', '-cfast',
+	  '--incremental', $backup2path . '/backup_manifest' ],
+	"another incremental backup from node1");
+
+# Take a full backup from node2.
+my $backupother1path = $node1->backup_dir .
'/backupother1';
+$node2->command_ok(
+	[ 'pg_basebackup', '-D', $backupother1path, '--no-sync', '-cfast' ],
+	"full backup from node2");
+
+# Take an incremental backup from node2.
+my $backupother2path = $node1->backup_dir . '/backupother2';
+$node2->command_ok(
+	[ 'pg_basebackup', '-D', $backupother2path, '--no-sync', '-cfast',
+	  '--incremental', $backupother1path . '/backup_manifest' ],
+	"incremental backup from node2");
+
+# Result directory.
+my $resultpath = $node1->backup_dir . '/result';
+
+# Can't combine 2 full backups.
+$node1->command_fails_like(
+	[ 'pg_combinebackup', $backup1path, $backup1path, '-o', $resultpath ],
+	qr/is a full backup, but only the first backup should be a full backup/,
+	"can't combine full backups");
+
+# Can't combine 2 incremental backups.
+$node1->command_fails_like(
+	[ 'pg_combinebackup', $backup2path, $backup2path, '-o', $resultpath ],
+	qr/is an incremental backup, but the first backup should be a full backup/,
+	"can't combine two incremental backups");
+
+# Can't combine full backup with an incremental backup from a different system.
+$node1->command_fails_like(
+	[ 'pg_combinebackup', $backup1path, $backupother2path, '-o', $resultpath ],
+	qr/expected system identifier.*but found/,
+	"can't combine backups from different nodes");
+
+# Can't omit a required backup.
+$node1->command_fails_like(
+	[ 'pg_combinebackup', $backup1path, $backup3path, '-o', $resultpath ],
+	qr/starts at LSN.*but expected/,
+	"can't omit a required backup");
+
+# Can't combine backups in the wrong order.
+$node1->command_fails_like(
+	[ 'pg_combinebackup', $backup1path, $backup3path, $backup2path, '-o', $resultpath ],
+	qr/starts at LSN.*but expected/,
+	"can't combine backups in the wrong order");
+
+# Can combine 3 backups that match up properly.
+$node1->command_ok(
+	[ 'pg_combinebackup', $backup1path, $backup2path, $backup3path, '-o', $resultpath ],
+	"can combine 3 matching backups");
+rmtree($resultpath);
+
+# Can combine full backup with first incremental.
+my $synthetic12path = $node1->backup_dir . '/synthetic12';
+$node1->command_ok(
+	[ 'pg_combinebackup', $backup1path, $backup2path, '-o', $synthetic12path ],
+	"can combine 2 matching backups");
+
+# Can combine result of previous step with second incremental.
+$node1->command_ok(
+	[ 'pg_combinebackup', $synthetic12path, $backup3path, '-o', $resultpath ],
+	"can combine synthetic backup with later incremental");
+rmtree($resultpath);
+
+# Can't combine result of 1+2 with 2.
+$node1->command_fails_like(
+	[ 'pg_combinebackup', $synthetic12path, $backup2path, '-o', $resultpath ],
+	qr/starts at LSN.*but expected/,
+	"can't combine synthetic backup with included incremental");
+
+# OK, that's all.
+done_testing(); diff --git a/src/bin/pg_combinebackup/write_manifest.c b/src/bin/pg_combinebackup/write_manifest.c new file mode 100644 index 00000000000..82160134d8b --- /dev/null +++ b/src/bin/pg_combinebackup/write_manifest.c @@ -0,0 +1,293 @@ +/*-------------------------------------------------------------------------
+ *
+ * Write a new backup manifest.
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/write_manifest.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#include <fcntl.h> +#include <time.h> +#include <unistd.h> + +#include "common/checksum_helper.h" +#include "common/file_perm.h" +#include "common/logging.h" +#include "lib/stringinfo.h" +#include "load_manifest.h" +#include "mb/pg_wchar.h" +#include "write_manifest.h" + +struct manifest_writer +{ + char pathname[MAXPGPATH]; + int fd; + StringInfoData buf; + bool first_file; + bool still_checksumming; + pg_checksum_context manifest_ctx; +}; + +static void escape_json(StringInfo buf, const char *str); +static void flush_manifest(manifest_writer *mwriter); +static size_t hex_encode(const uint8 *src, size_t len, char *dst); + +/* + * Create a new backup manifest writer. + * + * The backup manifest will be written into a file named backup_manifest + * in the specified directory. + */ +manifest_writer * +create_manifest_writer(char *directory) +{ + manifest_writer *mwriter = pg_malloc(sizeof(manifest_writer)); + + snprintf(mwriter->pathname, MAXPGPATH, "%s/backup_manifest", directory); + mwriter->fd = -1; + initStringInfo(&mwriter->buf); + mwriter->first_file = true; + mwriter->still_checksumming = true; + pg_checksum_init(&mwriter->manifest_ctx, CHECKSUM_TYPE_SHA256); + + appendStringInfo(&mwriter->buf, + "{ \"PostgreSQL-Backup-Manifest-Version\": 1,\n" + "\"Files\": ["); + + return mwriter; +} + +/* + * Add an entry for a file to a backup manifest. + * + * This is very similar to the backend's AddFileToBackupManifest, but + * various adjustments are required due to frontend/backend differences + * and other details. 
+ */ +void +add_file_to_manifest(manifest_writer *mwriter, const char *manifest_path, + size_t size, pg_time_t mtime, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload) +{ + int pathlen = strlen(manifest_path); + + if (mwriter->first_file) + { + appendStringInfoChar(&mwriter->buf, '\n'); + mwriter->first_file = false; + } + else + appendStringInfoString(&mwriter->buf, ",\n"); + + if (pg_encoding_verifymbstr(PG_UTF8, manifest_path, pathlen) == pathlen) + { + appendStringInfoString(&mwriter->buf, "{ \"Path\": "); + escape_json(&mwriter->buf, manifest_path); + appendStringInfoString(&mwriter->buf, ", "); + } + else + { + appendStringInfoString(&mwriter->buf, "{ \"Encoded-Path\": \""); + enlargeStringInfo(&mwriter->buf, 2 * pathlen); + mwriter->buf.len += hex_encode((const uint8 *) manifest_path, pathlen, + &mwriter->buf.data[mwriter->buf.len]); + appendStringInfoString(&mwriter->buf, "\", "); + } + + appendStringInfo(&mwriter->buf, "\"Size\": %zu, ", size); + + appendStringInfoString(&mwriter->buf, "\"Last-Modified\": \""); + enlargeStringInfo(&mwriter->buf, 128); + mwriter->buf.len += strftime(&mwriter->buf.data[mwriter->buf.len], 128, + "%Y-%m-%d %H:%M:%S %Z", + gmtime(&mtime)); + appendStringInfoChar(&mwriter->buf, '"'); + + if (mwriter->buf.len > 128 * 1024) + flush_manifest(mwriter); + + if (checksum_length > 0) + { + appendStringInfo(&mwriter->buf, + ", \"Checksum-Algorithm\": \"%s\", \"Checksum\": \"", + pg_checksum_type_name(checksum_type)); + + enlargeStringInfo(&mwriter->buf, 2 * checksum_length); + mwriter->buf.len += hex_encode(checksum_payload, checksum_length, + &mwriter->buf.data[mwriter->buf.len]); + + appendStringInfoChar(&mwriter->buf, '"'); + } + + appendStringInfoString(&mwriter->buf, " }"); + + if (mwriter->buf.len > 128 * 1024) + flush_manifest(mwriter); +} + +/* + * Finalize the backup_manifest. + */ +void +finalize_manifest(manifest_writer *mwriter, + manifest_wal_range *first_wal_range) +{ + uint8 checksumbuf[PG_SHA256_DIGEST_LENGTH]; + int len; + manifest_wal_range *wal_range; + + /* Terminate the list of files. */ + appendStringInfoString(&mwriter->buf, "\n],\n"); + + /* Start a list of LSN ranges. */ + appendStringInfoString(&mwriter->buf, "\"WAL-Ranges\": [\n"); + + for (wal_range = first_wal_range; wal_range != NULL; + wal_range = wal_range->next) + appendStringInfo(&mwriter->buf, + "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%X\", \"End-LSN\": \"%X/%X\" }", + wal_range == first_wal_range ? "" : ",\n", + wal_range->tli, + LSN_FORMAT_ARGS(wal_range->start_lsn), + LSN_FORMAT_ARGS(wal_range->end_lsn)); + + /* Terminate the list of WAL ranges. */ + appendStringInfoString(&mwriter->buf, "\n],\n"); + + /* Flush accumulated data and update checksum calculation. */ + flush_manifest(mwriter); + + /* Checksum only includes data up to this point. */ + mwriter->still_checksumming = false; + + /* Compute and insert manifest checksum. */ + appendStringInfoString(&mwriter->buf, "\"Manifest-Checksum\": \""); + enlargeStringInfo(&mwriter->buf, 2 * PG_SHA256_DIGEST_STRING_LENGTH); + len = pg_checksum_final(&mwriter->manifest_ctx, checksumbuf); + Assert(len == PG_SHA256_DIGEST_LENGTH); + mwriter->buf.len += + hex_encode(checksumbuf, len, &mwriter->buf.data[mwriter->buf.len]); + appendStringInfoString(&mwriter->buf, "\"}\n"); + + /* Flush the last manifest checksum itself. */ + flush_manifest(mwriter); + + /* Close the file. 
*/
+	if (close(mwriter->fd) != 0)
+		pg_fatal("could not close file \"%s\": %m", mwriter->pathname);
+	mwriter->fd = -1;
+}
+
+/*
+ * Produce a JSON string literal, properly escaping characters in the text.
+ */
+static void
+escape_json(StringInfo buf, const char *str)
+{
+	const char *p;
+
+	appendStringInfoCharMacro(buf, '"');
+	for (p = str; *p; p++)
+	{
+		switch (*p)
+		{
+			case '\b':
+				appendStringInfoString(buf, "\\b");
+				break;
+			case '\f':
+				appendStringInfoString(buf, "\\f");
+				break;
+			case '\n':
+				appendStringInfoString(buf, "\\n");
+				break;
+			case '\r':
+				appendStringInfoString(buf, "\\r");
+				break;
+			case '\t':
+				appendStringInfoString(buf, "\\t");
+				break;
+			case '"':
+				appendStringInfoString(buf, "\\\"");
+				break;
+			case '\\':
+				appendStringInfoString(buf, "\\\\");
+				break;
+			default:
+				if ((unsigned char) *p < ' ')
+					appendStringInfo(buf, "\\u%04x", (int) *p);
+				else
+					appendStringInfoCharMacro(buf, *p);
+				break;
+		}
+	}
+	appendStringInfoCharMacro(buf, '"');
+}
+
+/*
+ * Flush whatever portion of the backup manifest we have generated and
+ * buffered in memory out to a file on disk.
+ *
+ * The first call to this function will create the file. After that, we
+ * keep it open and just append more data.
+ */
+static void
+flush_manifest(manifest_writer *mwriter)
+{
+	if (mwriter->fd == -1 &&
+		(mwriter->fd = open(mwriter->pathname,
+							O_WRONLY | O_CREAT | O_EXCL | PG_BINARY,
+							pg_file_create_mode)) < 0)
+		pg_fatal("could not open file \"%s\": %m", mwriter->pathname);
+
+	if (mwriter->buf.len > 0)
+	{
+		ssize_t		wb;
+
+		wb = write(mwriter->fd, mwriter->buf.data, mwriter->buf.len);
+		if (wb != mwriter->buf.len)
+		{
+			if (wb < 0)
+				pg_fatal("could not write file \"%s\": %m", mwriter->pathname);
+			else
+				pg_fatal("could not write file \"%s\": wrote only %d of %d bytes",
+						 mwriter->pathname, (int) wb, mwriter->buf.len);
+		}
+
+		if (mwriter->still_checksumming)
+			pg_checksum_update(&mwriter->manifest_ctx,
+							   (uint8 *) mwriter->buf.data,
+							   mwriter->buf.len);
+		resetStringInfo(&mwriter->buf);
+	}
+}
+
+/*
+ * Encode bytes using two hexadecimal digits for each one.
+ */
+static size_t
+hex_encode(const uint8 *src, size_t len, char *dst)
+{
+	const uint8 *end = src + len;
+
+	while (src < end)
+	{
+		unsigned	n1 = (*src >> 4) & 0xF;
+		unsigned	n2 = *src & 0xF;
+
+		*dst++ = n1 < 10 ? '0' + n1 : 'a' + n1 - 10;
+		*dst++ = n2 < 10 ? '0' + n2 : 'a' + n2 - 10;
+		++src;
+	}
+
+	return len * 2;
+}
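Taken together, write_manifest.c and its header amount to a three-step API: create a writer, add one entry per file, then finalize, which appends the WAL-Ranges list and the embedded SHA-256 manifest checksum. A minimal sketch of the call sequence follows; the output directory, file name, size, mtime, and the NULL WAL-range list are invented for illustration and are not taken from this patch:

    #include "postgres_fe.h"

    #include "write_manifest.h"

    static void
    example_write_manifest(void)
    {
        char        outdir[] = "/tmp/combined";   /* invented output directory */
        manifest_writer *mwriter = create_manifest_writer(outdir);

        /* One call per file placed in the output backup; values invented. */
        add_file_to_manifest(mwriter, "PG_VERSION", 3, (pg_time_t) 1700000000,
                             CHECKSUM_TYPE_NONE, 0, NULL);

        /*
         * Passing NULL emits an empty WAL-Ranges list; real callers would
         * pass the ranges taken from the input manifests.
         */
        finalize_manifest(mwriter, NULL);
    }

Note that the writer buffers output and flushes every 128kB or so, feeding each flushed chunk into the running SHA-256 context until the final checksum field itself is written, which is why the checksum covers everything up to, but not including, the Manifest-Checksum key.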
diff --git a/src/bin/pg_combinebackup/write_manifest.h b/src/bin/pg_combinebackup/write_manifest.h new file mode 100644 index 00000000000..8fd7fe02c87 --- /dev/null +++ b/src/bin/pg_combinebackup/write_manifest.h @@ -0,0 +1,33 @@ +/*-------------------------------------------------------------------------
+ *
+ * Write a new backup manifest.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/bin/pg_combinebackup/write_manifest.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef WRITE_MANIFEST_H
+#define WRITE_MANIFEST_H
+
+#include "common/checksum_helper.h"
+#include "pgtime.h"
+
+struct manifest_wal_range;
+
+struct manifest_writer;
+typedef struct manifest_writer manifest_writer;
+
+extern manifest_writer *create_manifest_writer(char *directory);
+extern void add_file_to_manifest(manifest_writer *mwriter,
+								 const char *manifest_path,
+								 size_t size, pg_time_t mtime,
+								 pg_checksum_type checksum_type,
+								 int checksum_length,
+								 uint8 *checksum_payload);
+extern void finalize_manifest(manifest_writer *mwriter,
+							  struct manifest_wal_range *first_wal_range);
+
+#endif							/* WRITE_MANIFEST_H */ diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 3ae3fc06df2..5407f51a4e7 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -85,6 +85,7 @@ static void RewriteControlFile(void); static void FindEndOfXLOG(void); static void KillExistingXLOG(void); static void KillExistingArchiveStatus(void); +static void KillExistingWALSummaries(void); static void WriteEmptyXLOG(void); static void usage(void); @@ -493,6 +494,7 @@ main(int argc, char *argv[]) RewriteControlFile(); KillExistingXLOG(); KillExistingArchiveStatus(); + KillExistingWALSummaries(); WriteEmptyXLOG(); printf(_("Write-ahead log reset\n")); @@ -1034,6 +1036,40 @@ KillExistingArchiveStatus(void) pg_fatal("could not close directory \"%s\": %m", ARCHSTATDIR); } +/*
+ * Remove existing WAL summary files
+ */
+static void
+KillExistingWALSummaries(void)
+{
+#define WALSUMMARYDIR XLOGDIR "/summaries"
+#define WALSUMMARY_NHEXCHARS 40
+
+	DIR		   *xldir;
+	struct dirent *xlde;
+	char		path[MAXPGPATH + sizeof(WALSUMMARYDIR)];
+
+	xldir = opendir(WALSUMMARYDIR);
+	if (xldir == NULL)
+		pg_fatal("could not open directory \"%s\": %m", WALSUMMARYDIR);
+
+	while (errno = 0, (xlde = readdir(xldir)) != NULL)
+	{
+		if (strspn(xlde->d_name, "0123456789ABCDEF") == WALSUMMARY_NHEXCHARS &&
+			strcmp(xlde->d_name + WALSUMMARY_NHEXCHARS, ".summary") == 0)
+		{
+			snprintf(path, sizeof(path), "%s/%s", WALSUMMARYDIR, xlde->d_name);
+			if (unlink(path) < 0)
+				pg_fatal("could not delete file \"%s\": %m", path);
+		}
+	}
+
+	if (errno)
+		pg_fatal("could not read directory \"%s\": %m", WALSUMMARYDIR);
+
+	if (closedir(xldir))
+		pg_fatal("could not close directory \"%s\": %m", WALSUMMARYDIR);
+} /* * Write an empty XLOG file, containing only the checkpoint record
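The hard-coded 40 in WALSUMMARY_NHEXCHARS matches how WAL summary files are named: an 8-hex-digit timeline ID followed by a 16-hex-digit start LSN and a 16-hex-digit end LSN, 8 + 16 + 16 = 40 characters before the ".summary" suffix. As a sketch of that layout (the helper below is hypothetical and not part of this patch; LSN_FORMAT_ARGS expands one LSN into two uint32 values, hence each pair of %08X):

    #include "postgres.h"

    #include "access/xlogdefs.h"

    /*
     * Hypothetical helper: composes the kind of file name that
     * KillExistingWALSummaries() matches, assuming the TLI/start-LSN/end-LSN
     * layout described above.
     */
    static void
    example_summary_name(char *buf, size_t buflen, TimeLineID tli,
                         XLogRecPtr start_lsn, XLogRecPtr end_lsn)
    {
        snprintf(buf, buflen, "%08X%08X%08X%08X%08X.summary",
                 tli, LSN_FORMAT_ARGS(start_lsn), LSN_FORMAT_ARGS(end_lsn));
    }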
diff --git a/src/include/access/xlogbackup.h b/src/include/access/xlogbackup.h index 1611358137b..90e04cad569 --- a/src/include/access/xlogbackup.h +++ b/src/include/access/xlogbackup.h @@ -28,6 +28,8 @@ typedef struct BackupState XLogRecPtr checkpointloc; /* last checkpoint location */ pg_time_t starttime; /* backup start time */ bool started_in_recovery; /* backup started in recovery? */
+	XLogRecPtr	istartpoint;	/* incremental based on backup at this LSN */
+	TimeLineID	istarttli;		/* incremental based on backup on this TLI */
 
 	/* Fields saved at the end of backup */
 	XLogRecPtr	stoppoint;		/* backup stop WAL location */ diff --git a/src/include/backup/basebackup.h b/src/include/backup/basebackup.h index 1432d9c206b..345bd22534c 100644 --- a/src/include/backup/basebackup.h +++ b/src/include/backup/basebackup.h @@ -34,6 +34,9 @@ typedef struct int64 size; /* total size as sent; -1 if not known */ } tablespaceinfo; -extern void SendBaseBackup(BaseBackupCmd *cmd); +struct IncrementalBackupInfo;
+
+extern void SendBaseBackup(BaseBackupCmd *cmd,
+						   struct IncrementalBackupInfo *ib);
 
 #endif							/* _BASEBACKUP_H */ diff --git a/src/include/backup/basebackup_incremental.h b/src/include/backup/basebackup_incremental.h new file mode 100644 index 00000000000..de99117599e --- /dev/null +++ b/src/include/backup/basebackup_incremental.h @@ -0,0 +1,55 @@ +/*-------------------------------------------------------------------------
+ *
+ * basebackup_incremental.h
+ *	  API for incremental backup support
+ *
+ * Portions Copyright (c) 2010-2023, PostgreSQL Global Development Group
+ *
+ * src/include/backup/basebackup_incremental.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BASEBACKUP_INCREMENTAL_H
+#define BASEBACKUP_INCREMENTAL_H
+
+#include "access/xlogbackup.h"
+#include "common/relpath.h"
+#include "storage/block.h"
+#include "utils/palloc.h"
+
+#define INCREMENTAL_MAGIC 0xd3ae1f0d
+
+typedef enum
+{
+	BACK_UP_FILE_FULLY,
+	BACK_UP_FILE_INCREMENTALLY
+} FileBackupMethod;
+
+struct IncrementalBackupInfo;
+typedef struct IncrementalBackupInfo IncrementalBackupInfo;
+
+extern IncrementalBackupInfo *CreateIncrementalBackupInfo(MemoryContext);
+
+extern void AppendIncrementalManifestData(IncrementalBackupInfo *ib,
+										  const char *data,
+										  int len);
+extern void FinalizeIncrementalManifest(IncrementalBackupInfo *ib);
+
+extern void PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
+										BackupState *backup_state);
+
+extern char *GetIncrementalFilePath(Oid dboid, Oid spcoid,
+									RelFileNumber relfilenumber,
+									ForkNumber forknum, unsigned segno);
+extern FileBackupMethod GetFileBackupMethod(IncrementalBackupInfo *ib,
+											const char *path,
+											Oid dboid, Oid spcoid,
+											RelFileNumber relfilenumber,
+											ForkNumber forknum,
+											unsigned segno, size_t size,
+											unsigned *num_blocks_required,
+											BlockNumber *relative_block_numbers,
+											unsigned *truncation_block_length);
+extern size_t GetIncrementalFileSize(unsigned num_blocks_required);
+
+#endif							/* BASEBACKUP_INCREMENTAL_H */ diff --git a/src/include/nodes/replnodes.h b/src/include/nodes/replnodes.h index 5142a087291..c98961c329f 100644 --- a/src/include/nodes/replnodes.h +++ b/src/include/nodes/replnodes.h @@ -108,4 +108,13 @@ typedef struct TimeLineHistoryCmd TimeLineID timeline; } TimeLineHistoryCmd; +/* ----------------------
+ *		UPLOAD_MANIFEST command
+ * ----------------------
+ */
+typedef struct UploadManifestCmd
+{
+	NodeTag		type;
+} UploadManifestCmd;
+
 #endif							/* REPLNODES_H */ diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index a020377761d..46cb2a65500 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -779,6 +779,10 @@ a tar-format backup, pass the name of the tar program to use in the keyword parameter tar_program. Note that tablespace tar files aren't handled here.
+To restore from an incremental backup, pass the parameter combine_with_prior
+as a reference to an array of prior backup names with which this backup
+is to be combined using pg_combinebackup.
+
 Streaming replication can be enabled on this node by passing the keyword
 parameter has_streaming => 1. This is disabled by default.
 @@ -816,7 +820,22 @@ sub init_from_backup mkdir $self->archive_dir; my $data_path = $self->data_dir; - if (defined $params{tar_program}) + if (defined $params{combine_with_prior})
+	{
+		my @prior_backups = @{$params{combine_with_prior}};
+		my @prior_backup_path;
+
+		for my $prior_backup_name (@prior_backups)
+		{
+			push @prior_backup_path,
+			  $root_node->backup_dir . '/' . $prior_backup_name;
+		}
+
+		local %ENV = $self->_get_env();
+		PostgreSQL::Test::Utils::system_or_bail('pg_combinebackup', '-d',
+			@prior_backup_path, $backup_path, '-o', $data_path);
+	}
+	elsif (defined $params{tar_program})
 	{
 		mkdir($data_path);
 		PostgreSQL::Test::Utils::system_or_bail($params{tar_program}, 'xf', diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 93900493142..e37ef9aa76d 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -4023,3 +4023,15 @@ SummarizerReadLocalXLogPrivate WalSummarizerData WalSummaryFile WalSummaryIO +FileBackupMethod
+IncrementalBackupInfo
+UploadManifestCmd
+backup_file_entry
+backup_wal_range
+cb_cleanup_dir
+cb_options
+cb_tablespace
+cb_tablespace_mapping
+manifest_data
+manifest_writer
+rfile
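The basebackup_incremental.h header introduced above is the per-file decision interface: once PrepareForIncrementalBackup has digested the uploaded manifest and the WAL summaries, the sender asks GetFileBackupMethod whether each relation file must be copied in full or only in part. A sketch of the call shape, with invented path, OIDs, and size (real callers in basebackup.c gather these while walking the data directory):

    #include "postgres.h"

    #include "backup/basebackup_incremental.h"

    /* Sketch only: how many bytes of one relation segment would be sent? */
    static size_t
    example_bytes_to_send(IncrementalBackupInfo *ib)
    {
        BlockNumber *relative_block_numbers;
        unsigned    num_blocks_required;
        unsigned    truncation_block_length;
        size_t      size = 100 * 8192;  /* invented: a 100-block segment */
        FileBackupMethod method;

        relative_block_numbers = palloc(sizeof(BlockNumber) * RELSEG_SIZE);
        method = GetFileBackupMethod(ib, "base/5/16384", 5, InvalidOid,
                                     16384, MAIN_FORKNUM, 0, size,
                                     &num_blocks_required,
                                     relative_block_numbers,
                                     &truncation_block_length);
        pfree(relative_block_numbers);

        if (method == BACK_UP_FILE_INCREMENTALLY)
            return GetIncrementalFileSize(num_blocks_required);

        return size;            /* BACK_UP_FILE_FULLY: send the whole file */
    }

This is also why GetIncrementalFileSize takes only a block count: an incremental file consists of a fixed header (magic, block list, truncation length) plus the changed blocks themselves, so its size is fully determined by how many blocks need to be sent.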