author     Tom Lane <tgl@sss.pgh.pa.us>  2018-09-14 17:31:51 -0400
committer  Tom Lane <tgl@sss.pgh.pa.us>  2018-09-14 17:31:51 -0400
commit     548e50976ce721b5e927d42a105c2f05b51b52a6 (patch)
tree       9d492f63d5f8715d72c903448467623fb1dd737a /src/bin/pg_dump/pg_backup_custom.c
parent     20bef2c3110af295501919bac463b87ac58876de (diff)
download   postgresql-548e50976ce721b5e927d42a105c2f05b51b52a6.tar.gz
           postgresql-548e50976ce721b5e927d42a105c2f05b51b52a6.zip
Improve parallel scheduling logic in pg_dump/pg_restore.
Previously, the way this worked was that a parallel pg_dump would
re-order the TABLE_DATA items in the dump's TOC into decreasing size
order, and separately re-order (some of) the INDEX items into decreasing
size order. Then pg_dump would dump the items in that order. Later,
parallel pg_restore just followed the TOC order. This method had lots
of deficiencies:
* TOC ordering randomly differed between parallel and non-parallel
dumps, and was hard to predict in the former case, causing problems
for building stable pg_dump test cases.
* Parallel restore only followed a well-chosen order if the dump had
been done in parallel; in particular, this never happened for restore
from custom-format dumps.
* The best order for restore isn't necessarily the same as for dump,
and it's not really static either because of locking considerations.
* TABLE_DATA and INDEX items aren't the only things that might take a lot
of work during restore. Scheduling was particularly stupid for the BLOBS
item, which might require lots of work during dump as well as restore,
but was left to the end in either case.
This patch removes the logic that changed the TOC order, fixing the
test instability problem. Instead, we sort the parallelizable items
just before processing them during a parallel dump. Independently
of that, parallel restore prioritizes the ready-to-execute tasks
based on the size of the underlying table. In the case of dependent
tasks such as index, constraint, or foreign key creation, the largest
relevant table is used as the metric for estimating the task length.
(This is pretty crude, but it should be enough to avoid the case we
want to avoid, which is ending the run with just a few large tasks
such that we can't make use of all N workers.)
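To make the restore-scheduling idea concrete, here is a minimal illustrative
sketch (not code from this patch; ReadyItem and pick_largest_ready are
hypothetical names) of choosing the largest ready-to-run task first, based on
per-task size estimates such as the dataLength values discussed above:

/* Sketch only: pick the ready task with the largest size estimate. */
typedef struct ReadyItem
{
    const char *desc;           /* e.g. "TABLE DATA", "INDEX", "BLOBS" */
    long        sizeEstimate;   /* bytes of underlying table data */
} ReadyItem;

static int
pick_largest_ready(const ReadyItem *ready, int nready)
{
    int         best = -1;
    long        bestSize = -1;
    int         i;

    /* Linear scan; a real dispatcher could keep a priority queue instead. */
    for (i = 0; i < nready; i++)
    {
        if (ready[i].sizeEstimate > bestSize)
        {
            bestSize = ready[i].sizeEstimate;
            best = i;
        }
    }
    return best;                /* index of the biggest ready task, or -1 */
}

Dispatching big tasks first reduces the chance that the run ends with one
long-running job while the remaining workers sit idle.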
Patch by me, responding to a complaint from Peter Eisentraut,
who also reviewed the patch.
Discussion: https://postgr.es/m/5137fe12-d0a2-4971-61b6-eb4e7e8875f8@2ndquadrant.com
Diffstat (limited to 'src/bin/pg_dump/pg_backup_custom.c')
-rw-r--r--   src/bin/pg_dump/pg_backup_custom.c   64
1 file changed, 64 insertions, 0 deletions
diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c
index ad18a6c684b..96f44e88b11 100644
--- a/src/bin/pg_dump/pg_backup_custom.c
+++ b/src/bin/pg_dump/pg_backup_custom.c
@@ -59,6 +59,8 @@ static void _StartBlob(ArchiveHandle *AH, TocEntry *te, Oid oid);
 static void _EndBlob(ArchiveHandle *AH, TocEntry *te, Oid oid);
 static void _EndBlobs(ArchiveHandle *AH, TocEntry *te);
 static void _LoadBlobs(ArchiveHandle *AH, bool drop);
+
+static void _PrepParallelRestore(ArchiveHandle *AH);
 static void _Clone(ArchiveHandle *AH);
 static void _DeClone(ArchiveHandle *AH);
 
@@ -129,6 +131,8 @@ InitArchiveFmt_Custom(ArchiveHandle *AH)
     AH->StartBlobPtr = _StartBlob;
     AH->EndBlobPtr = _EndBlob;
     AH->EndBlobsPtr = _EndBlobs;
+
+    AH->PrepParallelRestorePtr = _PrepParallelRestore;
     AH->ClonePtr = _Clone;
     AH->DeClonePtr = _DeClone;
 
@@ -776,6 +780,66 @@ _ReopenArchive(ArchiveHandle *AH)
 }
 
 /*
+ * Prepare for parallel restore.
+ *
+ * The main thing that needs to happen here is to fill in TABLE DATA and
+ * BLOBS TOC entries' dataLength fields with appropriate values to guide
+ * the ordering of restore jobs.  The source of said data is format-dependent,
+ * as is the exact meaning of the values.
+ *
+ * A format module might also choose to do other setup here.
+ */
+static void
+_PrepParallelRestore(ArchiveHandle *AH)
+{
+    lclContext *ctx = (lclContext *) AH->formatData;
+    TocEntry   *prev_te = NULL;
+    lclTocEntry *prev_tctx = NULL;
+    TocEntry   *te;
+
+    /*
+     * Knowing that the data items were dumped out in TOC order, we can
+     * reconstruct the length of each item as the delta to the start offset
+     * of the next data item.
+     */
+    for (te = AH->toc->next; te != AH->toc; te = te->next)
+    {
+        lclTocEntry *tctx = (lclTocEntry *) te->formatData;
+
+        /*
+         * Ignore entries without a known data offset; if we were unable to
+         * seek to rewrite the TOC when creating the archive, this'll be all
+         * of them, and we'll end up with no size estimates.
+         */
+        if (tctx->dataState != K_OFFSET_POS_SET)
+            continue;
+
+        /* Compute previous data item's length */
+        if (prev_te)
+        {
+            if (tctx->dataPos > prev_tctx->dataPos)
+                prev_te->dataLength = tctx->dataPos - prev_tctx->dataPos;
+        }
+
+        prev_te = te;
+        prev_tctx = tctx;
+    }
+
+    /* If OK to seek, we can determine the length of the last item */
+    if (prev_te && ctx->hasSeek)
+    {
+        pgoff_t     endpos;
+
+        if (fseeko(AH->FH, 0, SEEK_END) != 0)
+            exit_horribly(modulename, "error during file seek: %s\n",
+                          strerror(errno));
+        endpos = ftello(AH->FH);
+        if (endpos > prev_tctx->dataPos)
+            prev_te->dataLength = endpos - prev_tctx->dataPos;
+    }
+}
+
+/*
  * Clone format-specific fields during parallel restoration.
  */
 static void
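For reference, here is a standalone sketch of the offset-delta estimation that
_PrepParallelRestore performs above (simplified, hypothetical names; plain
longs stand in for pgoff_t): items are written to the archive in TOC order, so
each item's length can be estimated as the gap between its start offset and
the next item's start offset, with the archive's file size bounding the last
item.

#include <stdio.h>

/* Sketch only: estimate item lengths from consecutive start offsets. */
static void
estimate_lengths(const long *dataPos, long *dataLength, int nitems, long fileEnd)
{
    int         i;

    for (i = 0; i < nitems; i++)
    {
        long        next = (i + 1 < nitems) ? dataPos[i + 1] : fileEnd;

        if (next > dataPos[i])
            dataLength[i] = next - dataPos[i];
        else
            dataLength[i] = 0;      /* no usable estimate */
    }
}

int
main(void)
{
    long        pos[] = {100, 5000, 9000};  /* start offsets in the archive */
    long        len[3];
    int         i;

    estimate_lengths(pos, len, 3, 12000);   /* archive file ends at 12000 */
    for (i = 0; i < 3; i++)
        printf("item %d: ~%ld bytes\n", i, len[i]); /* 4900, 4000, 3000 */
    return 0;
}

The estimates are crude (they include per-item framing overhead), but as the
commit message notes, they only need to be good enough to order the work
sensibly.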