aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>2016-12-22 18:45:00 +0200
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>2016-12-22 18:45:00 +0200
commit01ec25631fe0eae6af67c29c61a358dc6b92ed3c (patch)
tree36f4d174b81394546cb8c7c173d1e58617491321
parentb86515da1a73d0a2e23aca728f18b5f9e809e89f (diff)
downloadpostgresql-01ec25631fe0eae6af67c29c61a358dc6b92ed3c.tar.gz
postgresql-01ec25631fe0eae6af67c29c61a358dc6b92ed3c.zip
Simplify tape block format.
No more indirect blocks. The blocks form a linked list instead. This saves some memory, because we don't need to have a buffer in memory to hold the indirect block (or blocks). To reflect that, TAPE_BUFFER_OVERHEAD is reduced from 3 to 1 buffer, which allows using more memory for building the initial runs. Reviewed by Peter Geoghegan and Robert Haas. Discussion: https://www.postgresql.org/message-id/34678beb-938e-646e-db9f-a7def5c44ada%40iki.fi
-rw-r--r--src/backend/utils/sort/logtape.c627
-rw-r--r--src/backend/utils/sort/tuplesort.c65
-rw-r--r--src/include/utils/logtape.h4
3 files changed, 218 insertions, 478 deletions
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index 316301061de..623f0e01d81 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -31,15 +31,8 @@
* in BLCKSZ-size blocks. Space allocation boils down to keeping track
* of which blocks in the underlying file belong to which logical tape,
* plus any blocks that are free (recycled and not yet reused).
- * The blocks in each logical tape are remembered using a method borrowed
- * from the Unix HFS filesystem: we store data block numbers in an
- * "indirect block". If an indirect block fills up, we write it out to
- * the underlying file and remember its location in a second-level indirect
- * block. In the same way second-level blocks are remembered in third-
- * level blocks, and so on if necessary (of course we're talking huge
- * amounts of data here). The topmost indirect block of a given logical
- * tape is never actually written out to the physical file, but all lower-
- * level indirect blocks will be.
+ * The blocks in each logical tape form a chain, with a prev- and next-
+ * pointer in each block.
*
* The initial write pass is guaranteed to fill the underlying file
* perfectly sequentially, no matter how data is divided into logical tapes.
@@ -87,58 +80,65 @@
#include "utils/memutils.h"
/*
- * Block indexes are "long"s, so we can fit this many per indirect block.
- * NB: we assume this is an exact fit!
- */
-#define BLOCKS_PER_INDIR_BLOCK ((int) (BLCKSZ / sizeof(long)))
-
-/*
- * We use a struct like this for each active indirection level of each
- * logical tape. If the indirect block is not the highest level of its
- * tape, the "nextup" link points to the next higher level. Only the
- * "ptrs" array is written out if we have to dump the indirect block to
- * disk. If "ptrs" is not completely full, we store -1L in the first
- * unused slot at completion of the write phase for the logical tape.
+ * A TapeBlockTrailer is stored at the end of each BLCKSZ block.
+ *
+ * The first block of a tape has prev == -1. The last block of a tape
+ * stores the number of valid bytes on the block, inverted, in 'next'
+ * Therefore next < 0 indicates the last block.
*/
-typedef struct IndirectBlock
+typedef struct TapeBlockTrailer
{
- int nextSlot; /* next pointer slot to write or read */
- struct IndirectBlock *nextup; /* parent indirect level, or NULL if
- * top */
- long ptrs[BLOCKS_PER_INDIR_BLOCK]; /* indexes of contained blocks */
-} IndirectBlock;
+ long prev; /* previous block on this tape, or -1 on first
+ * block */
+ long next; /* next block on this tape, or # of valid
+ * bytes on last block (if < 0) */
+} TapeBlockTrailer;
+
+#define TapeBlockPayloadSize (BLCKSZ - sizeof(TapeBlockTrailer))
+#define TapeBlockGetTrailer(buf) \
+ ((TapeBlockTrailer *) ((char *) buf + TapeBlockPayloadSize))
+
+#define TapeBlockIsLast(buf) (TapeBlockGetTrailer(buf)->next < 0)
+#define TapeBlockGetNBytes(buf) \
+ (TapeBlockIsLast(buf) ? \
+ (- TapeBlockGetTrailer(buf)->next) : TapeBlockPayloadSize)
+#define TapeBlockSetNBytes(buf, nbytes) \
+ (TapeBlockGetTrailer(buf)->next = -(nbytes))
+
/*
* This data structure represents a single "logical tape" within the set
- * of logical tapes stored in the same file. We must keep track of the
- * current partially-read-or-written data block as well as the active
- * indirect block level(s).
+ * of logical tapes stored in the same file.
+ *
+ * While writing, we hold the current partially-written data block in the
+ * buffer. While reading, we can hold multiple blocks in the buffer. Note
+ * that we don't retain the trailers of a block when it's read into the
+ * buffer. The buffer therefore contains one large contiguous chunk of data
+ * from the tape.
*/
typedef struct LogicalTape
{
- IndirectBlock *indirect; /* bottom of my indirect-block hierarchy */
bool writing; /* T while in write phase */
bool frozen; /* T if blocks should not be freed when read */
bool dirty; /* does buffer need to be written? */
/*
- * The total data volume in the logical tape is numFullBlocks * BLCKSZ +
- * lastBlockBytes. BUT: we do not update lastBlockBytes during writing,
- * only at completion of a write phase.
+ * Block numbers of the first, current, and next block of the tape.
+ *
+ * The "current" block number is only valid when writing, or reading from
+ * a frozen tape. (When reading from an unfrozen tape, we use a larger
+ * read buffer that holds multiple blocks, so the "current" block is
+ * ambiguous.)
*/
- long numFullBlocks; /* number of complete blocks in log tape */
- int lastBlockBytes; /* valid bytes in last (incomplete) block */
+ long firstBlockNumber;
+ long curBlockNumber;
+ long nextBlockNumber;
/*
- * Buffer for current data block. Note we don't bother to store the
- * actual file block number of the data block (during the write phase it
- * hasn't been assigned yet, and during read we don't care anymore). But
- * we do need the relative block number so we can detect end-of-tape while
- * reading.
+ * Buffer for current data block(s).
*/
char *buffer; /* physical buffer (separately palloc'd) */
int buffer_size; /* allocated size of the buffer */
- long curBlockNumber; /* this block's logical blk# within tape */
int pos; /* next read/write position in buffer */
int nbytes; /* total # of valid bytes in buffer */
} LogicalTape;
@@ -182,19 +182,6 @@ static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
static long ltsGetFreeBlock(LogicalTapeSet *lts);
static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);
-static void ltsRecordBlockNum(LogicalTapeSet *lts, IndirectBlock *indirect,
- long blocknum);
-static long ltsRewindIndirectBlock(LogicalTapeSet *lts,
- IndirectBlock *indirect,
- bool freezing);
-static long ltsRewindFrozenIndirectBlock(LogicalTapeSet *lts,
- IndirectBlock *indirect);
-static long ltsRecallNextBlockNum(LogicalTapeSet *lts,
- IndirectBlock *indirect,
- bool frozen);
-static long ltsRecallPrevBlockNum(LogicalTapeSet *lts,
- IndirectBlock *indirect);
-static void ltsDumpBuffer(LogicalTapeSet *lts, LogicalTape *lt);
/*
@@ -237,46 +224,40 @@ ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
/*
* Read as many blocks as we can into the per-tape buffer.
*
- * The caller can specify the next physical block number to read, in
- * datablocknum, or -1 to fetch the next block number from the internal block.
- * If datablocknum == -1, the caller must've already set curBlockNumber.
- *
* Returns true if anything was read, 'false' on EOF.
*/
static bool
-ltsReadFillBuffer(LogicalTapeSet *lts, LogicalTape *lt, long datablocknum)
+ltsReadFillBuffer(LogicalTapeSet *lts, LogicalTape *lt)
{
lt->pos = 0;
lt->nbytes = 0;
do
{
- /* Fetch next block number (unless provided by caller) */
- if (datablocknum == -1)
- {
- datablocknum = ltsRecallNextBlockNum(lts, lt->indirect, lt->frozen);
- if (datablocknum == -1L)
- break; /* EOF */
- lt->curBlockNumber++;
- }
+ char *thisbuf = lt->buffer + lt->nbytes;
+
+ /* Fetch next block number */
+ if (lt->nextBlockNumber == -1L)
+ break; /* EOF */
/* Read the block */
- ltsReadBlock(lts, datablocknum, (void *) (lt->buffer + lt->nbytes));
+ ltsReadBlock(lts, lt->nextBlockNumber, (void *) thisbuf);
if (!lt->frozen)
- ltsReleaseBlock(lts, datablocknum);
+ ltsReleaseBlock(lts, lt->nextBlockNumber);
+ lt->curBlockNumber = lt->nextBlockNumber;
- if (lt->curBlockNumber < lt->numFullBlocks)
- lt->nbytes += BLCKSZ;
- else
+ lt->nbytes += TapeBlockGetNBytes(thisbuf);
+ if (TapeBlockIsLast(thisbuf))
{
+ lt->nextBlockNumber = -1L;
/* EOF */
- lt->nbytes += lt->lastBlockBytes;
break;
}
+ else
+ lt->nextBlockNumber = TapeBlockGetTrailer(thisbuf)->next;
/* Advance to next block, if we have buffer space left */
- datablocknum = -1;
- } while (lt->nbytes < lt->buffer_size);
+ } while (lt->buffer_size - lt->nbytes > BLCKSZ);
return (lt->nbytes > 0);
}
@@ -361,203 +342,6 @@ ltsReleaseBlock(LogicalTapeSet *lts, long blocknum)
}
/*
- * These routines manipulate indirect-block hierarchies. All are recursive
- * so that they don't have any specific limit on the depth of hierarchy.
- */
-
-/*
- * Record a data block number in a logical tape's lowest indirect block,
- * or record an indirect block's number in the next higher indirect level.
- */
-static void
-ltsRecordBlockNum(LogicalTapeSet *lts, IndirectBlock *indirect,
- long blocknum)
-{
- if (indirect->nextSlot >= BLOCKS_PER_INDIR_BLOCK)
- {
- /*
- * This indirect block is full, so dump it out and recursively save
- * its address in the next indirection level. Create a new
- * indirection level if there wasn't one before.
- */
- long indirblock = ltsGetFreeBlock(lts);
-
- ltsWriteBlock(lts, indirblock, (void *) indirect->ptrs);
- if (indirect->nextup == NULL)
- {
- indirect->nextup = (IndirectBlock *) palloc(sizeof(IndirectBlock));
- indirect->nextup->nextSlot = 0;
- indirect->nextup->nextup = NULL;
- }
- ltsRecordBlockNum(lts, indirect->nextup, indirblock);
-
- /*
- * Reset to fill another indirect block at this level.
- */
- indirect->nextSlot = 0;
- }
- indirect->ptrs[indirect->nextSlot++] = blocknum;
-}
-
-/*
- * Reset a logical tape's indirect-block hierarchy after a write pass
- * to prepare for reading. We dump out partly-filled blocks except
- * at the top of the hierarchy, and we rewind each level to the start.
- * This call returns the first data block number, or -1L if the tape
- * is empty.
- *
- * Unless 'freezing' is true, release indirect blocks to the free pool after
- * reading them.
- */
-static long
-ltsRewindIndirectBlock(LogicalTapeSet *lts,
- IndirectBlock *indirect,
- bool freezing)
-{
- /* Handle case of never-written-to tape */
- if (indirect == NULL)
- return -1L;
-
- /* Insert sentinel if block is not full */
- if (indirect->nextSlot < BLOCKS_PER_INDIR_BLOCK)
- indirect->ptrs[indirect->nextSlot] = -1L;
-
- /*
- * If block is not topmost, write it out, and recurse to obtain address of
- * first block in this hierarchy level. Read that one in.
- */
- if (indirect->nextup != NULL)
- {
- long indirblock = ltsGetFreeBlock(lts);
-
- ltsWriteBlock(lts, indirblock, (void *) indirect->ptrs);
- ltsRecordBlockNum(lts, indirect->nextup, indirblock);
- indirblock = ltsRewindIndirectBlock(lts, indirect->nextup, freezing);
- Assert(indirblock != -1L);
- ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
- if (!freezing)
- ltsReleaseBlock(lts, indirblock);
- }
-
- /*
- * Reset my next-block pointer, and then fetch a block number if any.
- */
- indirect->nextSlot = 0;
- if (indirect->ptrs[0] == -1L)
- return -1L;
- return indirect->ptrs[indirect->nextSlot++];
-}
-
-/*
- * Rewind a previously-frozen indirect-block hierarchy for another read pass.
- * This call returns the first data block number, or -1L if the tape
- * is empty.
- */
-static long
-ltsRewindFrozenIndirectBlock(LogicalTapeSet *lts,
- IndirectBlock *indirect)
-{
- /* Handle case of never-written-to tape */
- if (indirect == NULL)
- return -1L;
-
- /*
- * If block is not topmost, recurse to obtain address of first block in
- * this hierarchy level. Read that one in.
- */
- if (indirect->nextup != NULL)
- {
- long indirblock;
-
- indirblock = ltsRewindFrozenIndirectBlock(lts, indirect->nextup);
- Assert(indirblock != -1L);
- ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
- }
-
- /*
- * Reset my next-block pointer, and then fetch a block number if any.
- */
- indirect->nextSlot = 0;
- if (indirect->ptrs[0] == -1L)
- return -1L;
- return indirect->ptrs[indirect->nextSlot++];
-}
-
-/*
- * Obtain next data block number in the forward direction, or -1L if no more.
- *
- * Unless 'frozen' is true, release indirect blocks to the free pool after
- * reading them.
- */
-static long
-ltsRecallNextBlockNum(LogicalTapeSet *lts,
- IndirectBlock *indirect,
- bool frozen)
-{
- /* Handle case of never-written-to tape */
- if (indirect == NULL)
- return -1L;
-
- if (indirect->nextSlot >= BLOCKS_PER_INDIR_BLOCK ||
- indirect->ptrs[indirect->nextSlot] == -1L)
- {
- long indirblock;
-
- if (indirect->nextup == NULL)
- return -1L; /* nothing left at this level */
- indirblock = ltsRecallNextBlockNum(lts, indirect->nextup, frozen);
- if (indirblock == -1L)
- return -1L; /* nothing left at this level */
- ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
- if (!frozen)
- ltsReleaseBlock(lts, indirblock);
- indirect->nextSlot = 0;
- }
- if (indirect->ptrs[indirect->nextSlot] == -1L)
- return -1L;
- return indirect->ptrs[indirect->nextSlot++];
-}
-
-/*
- * Obtain next data block number in the reverse direction, or -1L if no more.
- *
- * Note this fetches the block# before the one last returned, no matter which
- * direction of call returned that one. If we fail, no change in state.
- *
- * This routine can only be used in 'frozen' state, so there's no need to
- * pass a parameter telling whether to release blocks ... we never do.
- */
-static long
-ltsRecallPrevBlockNum(LogicalTapeSet *lts,
- IndirectBlock *indirect)
-{
- /* Handle case of never-written-to tape */
- if (indirect == NULL)
- return -1L;
-
- if (indirect->nextSlot <= 1)
- {
- long indirblock;
-
- if (indirect->nextup == NULL)
- return -1L; /* nothing left at this level */
- indirblock = ltsRecallPrevBlockNum(lts, indirect->nextup);
- if (indirblock == -1L)
- return -1L; /* nothing left at this level */
- ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
-
- /*
- * The previous block would only have been written out if full, so we
- * need not search it for a -1 sentinel.
- */
- indirect->nextSlot = BLOCKS_PER_INDIR_BLOCK + 1;
- }
- indirect->nextSlot--;
- return indirect->ptrs[indirect->nextSlot - 1];
-}
-
-
-/*
* Create a set of logical tapes in a temporary underlying file.
*
* Each tape is initialized in write state.
@@ -585,23 +369,21 @@ LogicalTapeSetCreate(int ntapes)
lts->nTapes = ntapes;
/*
- * Initialize per-tape structs. Note we allocate the I/O buffer and
- * first-level indirect block for a tape only when it is first actually
- * written to. This avoids wasting memory space when tuplesort.c
- * overestimates the number of tapes needed.
+ * Initialize per-tape structs. Note we allocate the I/O buffer and the
+ * first block for a tape only when it is first actually written to. This
+ * avoids wasting memory space when tuplesort.c overestimates the number
+ * of tapes needed.
*/
for (i = 0; i < ntapes; i++)
{
lt = &lts->tapes[i];
- lt->indirect = NULL;
lt->writing = true;
lt->frozen = false;
lt->dirty = false;
- lt->numFullBlocks = 0L;
- lt->lastBlockBytes = 0;
+ lt->firstBlockNumber = -1L;
+ lt->curBlockNumber = -1L;
lt->buffer = NULL;
lt->buffer_size = 0;
- lt->curBlockNumber = 0L;
lt->pos = 0;
lt->nbytes = 0;
}
@@ -615,19 +397,12 @@ void
LogicalTapeSetClose(LogicalTapeSet *lts)
{
LogicalTape *lt;
- IndirectBlock *ib,
- *nextib;
int i;
BufFileClose(lts->pfile);
for (i = 0; i < lts->nTapes; i++)
{
lt = &lts->tapes[i];
- for (ib = lt->indirect; ib != NULL; ib = nextib)
- {
- nextib = ib->nextup;
- pfree(ib);
- }
if (lt->buffer)
pfree(lt->buffer);
}
@@ -651,21 +426,6 @@ LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts)
}
/*
- * Dump the dirty buffer of a logical tape.
- */
-static void
-ltsDumpBuffer(LogicalTapeSet *lts, LogicalTape *lt)
-{
- long datablock = ltsGetFreeBlock(lts);
-
- Assert(lt->dirty);
- ltsWriteBlock(lts, datablock, (void *) lt->buffer);
- ltsRecordBlockNum(lts, lt->indirect, datablock);
- lt->dirty = false;
- /* Caller must do other state update as needed */
-}
-
-/*
* Write to a logical tape.
*
* There are no error returns; we ereport() on failure.
@@ -681,39 +441,55 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
lt = &lts->tapes[tapenum];
Assert(lt->writing);
- /* Allocate data buffer and first indirect block on first write */
+ /* Allocate data buffer and first block on first write */
if (lt->buffer == NULL)
{
lt->buffer = (char *) palloc(BLCKSZ);
lt->buffer_size = BLCKSZ;
}
- if (lt->indirect == NULL)
+ if (lt->curBlockNumber == -1)
{
- lt->indirect = (IndirectBlock *) palloc(sizeof(IndirectBlock));
- lt->indirect->nextSlot = 0;
- lt->indirect->nextup = NULL;
+ Assert(lt->firstBlockNumber == -1);
+ Assert(lt->pos == 0);
+
+ lt->curBlockNumber = ltsGetFreeBlock(lts);
+ lt->firstBlockNumber = lt->curBlockNumber;
+
+ TapeBlockGetTrailer(lt->buffer)->prev = -1L;
}
Assert(lt->buffer_size == BLCKSZ);
while (size > 0)
{
- if (lt->pos >= BLCKSZ)
+ if (lt->pos >= TapeBlockPayloadSize)
{
/* Buffer full, dump it out */
- if (lt->dirty)
- ltsDumpBuffer(lts, lt);
- else
+ long nextBlockNumber;
+
+ if (!lt->dirty)
{
/* Hmm, went directly from reading to writing? */
elog(ERROR, "invalid logtape state: should be dirty");
}
- lt->numFullBlocks++;
- lt->curBlockNumber++;
+
+ /*
+ * First allocate the next block, so that we can store it in the
+ * 'next' pointer of this block.
+ */
+ nextBlockNumber = ltsGetFreeBlock(lts);
+
+ /* set the next-pointer and dump the current block. */
+ TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber;
+ ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
+
+ /* initialize the prev-pointer of the next block */
+ TapeBlockGetTrailer(lt->buffer)->prev = lt->curBlockNumber;
+ lt->curBlockNumber = nextBlockNumber;
lt->pos = 0;
lt->nbytes = 0;
}
- nthistime = BLCKSZ - lt->pos;
+ nthistime = TapeBlockPayloadSize - lt->pos;
if (nthistime > size)
nthistime = size;
Assert(nthistime > 0);
@@ -734,19 +510,17 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
*
* The tape must currently be in writing state, or "frozen" in read state.
*
- * 'buffer_size' specifies how much memory to use for the read buffer. It
- * does not include the memory needed for the indirect blocks. Regardless
- * of the argument, the actual amount of memory used is between BLCKSZ and
- * MaxAllocSize, and is a multiple of BLCKSZ. The given value is rounded
- * down and truncated to fit those constraints, if necessary. If the tape
- * is frozen, the 'buffer_size' argument is ignored, and a small BLCKSZ byte
- * buffer is used.
+ * 'buffer_size' specifies how much memory to use for the read buffer.
+ * Regardless of the argument, the actual amount of memory used is between
+ * BLCKSZ and MaxAllocSize, and is a multiple of BLCKSZ. The given value is
+ * rounded down and truncated to fit those constraints, if necessary. If the
+ * tape is frozen, the 'buffer_size' argument is ignored, and a small BLCKSZ
+ * byte buffer is used.
*/
void
LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
{
LogicalTape *lt;
- long datablocknum;
Assert(tapenum >= 0 && tapenum < lts->nTapes);
lt = &lts->tapes[tapenum];
@@ -776,14 +550,15 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
if (lt->writing)
{
/*
- * Completion of a write phase. Flush last partial data block, flush
- * any partial indirect blocks, rewind for normal (destructive) read.
+ * Completion of a write phase. Flush last partial data block, and
+ * rewind for normal (destructive) read.
*/
if (lt->dirty)
- ltsDumpBuffer(lts, lt);
- lt->lastBlockBytes = lt->nbytes;
+ {
+ TapeBlockSetNBytes(lt->buffer, lt->nbytes);
+ ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
+ }
lt->writing = false;
- datablocknum = ltsRewindIndirectBlock(lts, lt->indirect, false);
}
else
{
@@ -792,7 +567,6 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
* pass.
*/
Assert(lt->frozen);
- datablocknum = ltsRewindFrozenIndirectBlock(lts, lt->indirect);
}
/* Allocate a read buffer (unless the tape is empty) */
@@ -800,18 +574,17 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
pfree(lt->buffer);
lt->buffer = NULL;
lt->buffer_size = 0;
- if (datablocknum != -1L)
+ if (lt->firstBlockNumber != -1L)
{
lt->buffer = palloc(buffer_size);
lt->buffer_size = buffer_size;
}
/* Read the first block, or reset if tape is empty */
- lt->curBlockNumber = 0L;
+ lt->nextBlockNumber = lt->firstBlockNumber;
lt->pos = 0;
lt->nbytes = 0;
- if (datablocknum != -1L)
- ltsReadFillBuffer(lts, lt, datablocknum);
+ ltsReadFillBuffer(lts, lt);
}
/*
@@ -826,38 +599,21 @@ void
LogicalTapeRewindForWrite(LogicalTapeSet *lts, int tapenum)
{
LogicalTape *lt;
- IndirectBlock *ib,
- *nextib;
Assert(tapenum >= 0 && tapenum < lts->nTapes);
lt = &lts->tapes[tapenum];
Assert(!lt->writing && !lt->frozen);
- /* Must truncate the indirect-block hierarchy down to one level. */
- if (lt->indirect)
- {
- for (ib = lt->indirect->nextup; ib != NULL; ib = nextib)
- {
- nextib = ib->nextup;
- pfree(ib);
- }
- lt->indirect->nextSlot = 0;
- lt->indirect->nextup = NULL;
- }
lt->writing = true;
lt->dirty = false;
- lt->numFullBlocks = 0L;
- lt->lastBlockBytes = 0;
- lt->curBlockNumber = 0L;
+ lt->firstBlockNumber = -1L;
+ lt->curBlockNumber = -1L;
lt->pos = 0;
lt->nbytes = 0;
-
if (lt->buffer)
- {
pfree(lt->buffer);
- lt->buffer = NULL;
- lt->buffer_size = 0;
- }
+ lt->buffer = NULL;
+ lt->buffer_size = 0;
}
/*
@@ -882,7 +638,7 @@ LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
if (lt->pos >= lt->nbytes)
{
/* Try to load more data into buffer. */
- if (!ltsReadFillBuffer(lts, lt, -1))
+ if (!ltsReadFillBuffer(lts, lt))
break; /* EOF */
}
@@ -917,22 +673,23 @@ void
LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
{
LogicalTape *lt;
- long datablocknum;
Assert(tapenum >= 0 && tapenum < lts->nTapes);
lt = &lts->tapes[tapenum];
Assert(lt->writing);
/*
- * Completion of a write phase. Flush last partial data block, flush any
- * partial indirect blocks, rewind for nondestructive read.
+ * Completion of a write phase. Flush last partial data block, and rewind
+ * for nondestructive read.
*/
if (lt->dirty)
- ltsDumpBuffer(lts, lt);
- lt->lastBlockBytes = lt->nbytes;
+ {
+ TapeBlockSetNBytes(lt->buffer, lt->nbytes);
+ ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
+ lt->writing = false;
+ }
lt->writing = false;
lt->frozen = true;
- datablocknum = ltsRewindIndirectBlock(lts, lt->indirect, true);
/*
* The seek and backspace functions assume a single block read buffer.
@@ -950,15 +707,18 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
}
/* Read the first block, or reset if tape is empty */
- lt->curBlockNumber = 0L;
+ lt->curBlockNumber = lt->firstBlockNumber;
lt->pos = 0;
lt->nbytes = 0;
- if (datablocknum != -1L)
- {
- ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
- lt->nbytes = (lt->curBlockNumber < lt->numFullBlocks) ?
- BLCKSZ : lt->lastBlockBytes;
- }
+
+ if (lt->firstBlockNumber == -1L)
+ lt->nextBlockNumber = -1L;
+ ltsReadBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
+ if (TapeBlockIsLast(lt->buffer))
+ lt->nextBlockNumber = -1L;
+ else
+ lt->nextBlockNumber = TapeBlockGetTrailer(lt->buffer)->next;
+ lt->nbytes = TapeBlockGetNBytes(lt->buffer);
}
/*
@@ -969,15 +729,16 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
* random access during write, and an unfrozen read tape may have
* already discarded the desired data!
*
- * Return value is TRUE if seek successful, FALSE if there isn't that much
- * data before the current point (in which case there's no state change).
+ * Returns the number of bytes backed up. It can be less than the
+ * requested amount, if there isn't that much data before the current
+ * position. The tape is positioned to the beginning of the tape in
+ * that case.
*/
-bool
+size_t
LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
{
LogicalTape *lt;
- long nblocks;
- int newpos;
+ size_t seekpos = 0;
Assert(tapenum >= 0 && tapenum < lts->nTapes);
lt = &lts->tapes[tapenum];
@@ -990,45 +751,50 @@ LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
if (size <= (size_t) lt->pos)
{
lt->pos -= (int) size;
- return true;
+ return size;
}
/*
- * Not-so-easy case. Figure out whether it's possible at all.
+ * Not-so-easy case, have to walk back the chain of blocks. This
+ * implementation would be pretty inefficient for long seeks, but we
+ * really aren't doing that (a seek over one tuple is typical).
*/
- size -= (size_t) lt->pos; /* part within this block */
- nblocks = size / BLCKSZ;
- size = size % BLCKSZ;
- if (size)
+ seekpos = (size_t) lt->pos; /* part within this block */
+ while (size > seekpos)
{
- nblocks++;
- newpos = (int) (BLCKSZ - size);
- }
- else
- newpos = 0;
- if (nblocks > lt->curBlockNumber)
- return false; /* a seek too far... */
-
- /*
- * OK, we need to back up nblocks blocks. This implementation would be
- * pretty inefficient for long seeks, but we really aren't expecting that
- * (a seek over one tuple is typical).
- */
- while (nblocks-- > 0)
- {
- long datablocknum = ltsRecallPrevBlockNum(lts, lt->indirect);
+ long prev = TapeBlockGetTrailer(lt->buffer)->prev;
- if (datablocknum == -1L)
- elog(ERROR, "unexpected end of tape");
- lt->curBlockNumber--;
- if (nblocks == 0)
+ if (prev == -1L)
{
- ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
- lt->nbytes = BLCKSZ;
+ /* Tried to back up beyond the beginning of tape. */
+ if (lt->curBlockNumber != lt->firstBlockNumber)
+ elog(ERROR, "unexpected end of tape");
+ lt->pos = 0;
+ return seekpos;
}
+
+ ltsReadBlock(lts, prev, (void *) lt->buffer);
+
+ if (TapeBlockGetTrailer(lt->buffer)->next != lt->curBlockNumber)
+ elog(ERROR, "broken tape, next of block %ld is %ld, expected %ld",
+ prev,
+ TapeBlockGetTrailer(lt->buffer)->next,
+ lt->curBlockNumber);
+
+ lt->nbytes = TapeBlockPayloadSize;
+ lt->curBlockNumber = prev;
+ lt->nextBlockNumber = TapeBlockGetTrailer(lt->buffer)->next;
+
+ seekpos += TapeBlockPayloadSize;
}
- lt->pos = newpos;
- return true;
+
+ /*
+ * 'seekpos' can now be greater than 'size', because it points to the
+ * beginning the target block. The difference is the position within the
+ * page.
+ */
+ lt->pos = seekpos - size;
+ return size;
}
/*
@@ -1036,10 +802,10 @@ LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
*
* *Only* a frozen-for-read tape can be seeked.
*
- * Return value is TRUE if seek successful, FALSE if there isn't that much
- * data in the tape (in which case there's no state change).
+ * Must be called with a block/offset previously returned by
+ * LogicalTapeTell().
*/
-bool
+void
LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
long blocknum, int offset)
{
@@ -1048,53 +814,20 @@ LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
Assert(tapenum >= 0 && tapenum < lts->nTapes);
lt = &lts->tapes[tapenum];
Assert(lt->frozen);
- Assert(offset >= 0 && offset <= BLCKSZ);
+ Assert(offset >= 0 && offset <= TapeBlockPayloadSize);
Assert(lt->buffer_size == BLCKSZ);
- /*
- * Easy case for seek within current block.
- */
- if (blocknum == lt->curBlockNumber && offset <= lt->nbytes)
+ if (blocknum != lt->curBlockNumber)
{
- lt->pos = offset;
- return true;
+ ltsReadBlock(lts, blocknum, (void *) lt->buffer);
+ lt->curBlockNumber = blocknum;
+ lt->nbytes = TapeBlockPayloadSize;
+ lt->nextBlockNumber = TapeBlockGetTrailer(lt->buffer)->next;
}
- /*
- * Not-so-easy case. Figure out whether it's possible at all.
- */
- if (blocknum < 0 || blocknum > lt->numFullBlocks ||
- (blocknum == lt->numFullBlocks && offset > lt->lastBlockBytes))
- return false;
-
- /*
- * OK, advance or back up to the target block. This implementation would
- * be pretty inefficient for long seeks, but we really aren't expecting
- * that (a seek over one tuple is typical).
- */
- while (lt->curBlockNumber > blocknum)
- {
- long datablocknum = ltsRecallPrevBlockNum(lts, lt->indirect);
-
- if (datablocknum == -1L)
- elog(ERROR, "unexpected end of tape");
- if (--lt->curBlockNumber == blocknum)
- ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
- }
- while (lt->curBlockNumber < blocknum)
- {
- long datablocknum = ltsRecallNextBlockNum(lts, lt->indirect,
- lt->frozen);
-
- if (datablocknum == -1L)
- elog(ERROR, "unexpected end of tape");
- if (++lt->curBlockNumber == blocknum)
- ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
- }
- lt->nbytes = (lt->curBlockNumber < lt->numFullBlocks) ?
- BLCKSZ : lt->lastBlockBytes;
+ if (offset > lt->nbytes)
+ elog(ERROR, "invalid tape seek position");
lt->pos = offset;
- return true;
}
/*
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 71524c267f5..2a62febd599 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -240,16 +240,16 @@ typedef enum
* Parameters for calculation of number of tapes to use --- see inittapes()
* and tuplesort_merge_order().
*
- * In this calculation we assume that each tape will cost us about 3 blocks
- * worth of buffer space (which is an underestimate for very large data
- * volumes, but it's probably close enough --- see logtape.c).
+ * In this calculation we assume that each tape will cost us about 1 blocks
+ * worth of buffer space. This ignores the overhead of all the other data
+ * structures needed for each tape, but it's probably close enough.
*
* MERGE_BUFFER_SIZE is how much data we'd like to read from each input
* tape during a preread cycle (see discussion at top of file).
*/
#define MINORDER 6 /* minimum merge order */
#define MAXORDER 500 /* maximum merge order */
-#define TAPE_BUFFER_OVERHEAD (BLCKSZ * 3)
+#define TAPE_BUFFER_OVERHEAD BLCKSZ
#define MERGE_BUFFER_SIZE (BLCKSZ * 32)
/*
@@ -1849,6 +1849,7 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
SortTuple *stup)
{
unsigned int tuplen;
+ size_t nmoved;
switch (state->status)
{
@@ -1948,10 +1949,13 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
* end of file; back up to fetch last tuple's ending length
* word. If seek fails we must have a completely empty file.
*/
- if (!LogicalTapeBackspace(state->tapeset,
- state->result_tape,
- 2 * sizeof(unsigned int)))
+ nmoved = LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ 2 * sizeof(unsigned int));
+ if (nmoved == 0)
return false;
+ else if (nmoved != 2 * sizeof(unsigned int))
+ elog(ERROR, "unexpected tape position");
state->eof_reached = false;
}
else
@@ -1960,31 +1964,34 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
* Back up and fetch previously-returned tuple's ending length
* word. If seek fails, assume we are at start of file.
*/
- if (!LogicalTapeBackspace(state->tapeset,
- state->result_tape,
- sizeof(unsigned int)))
+ nmoved = LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ sizeof(unsigned int));
+ if (nmoved == 0)
return false;
+ else if (nmoved != sizeof(unsigned int))
+ elog(ERROR, "unexpected tape position");
tuplen = getlen(state, state->result_tape, false);
/*
* Back up to get ending length word of tuple before it.
*/
- if (!LogicalTapeBackspace(state->tapeset,
- state->result_tape,
- tuplen + 2 * sizeof(unsigned int)))
+ nmoved = LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ tuplen + 2 * sizeof(unsigned int));
+ if (nmoved == tuplen + sizeof(unsigned int))
{
/*
- * If that fails, presumably the prev tuple is the first
- * in the file. Back up so that it becomes next to read
- * in forward direction (not obviously right, but that is
- * what in-memory case does).
+ * We backed up over the previous tuple, but there was no
+ * ending length word before it. That means that the prev
+ * tuple is the first tuple in the file. It is now the
+ * next to read in forward direction (not obviously right,
+ * but that is what in-memory case does).
*/
- if (!LogicalTapeBackspace(state->tapeset,
- state->result_tape,
- tuplen + sizeof(unsigned int)))
- elog(ERROR, "bogus tuple length in backward scan");
return false;
}
+ else if (nmoved != tuplen + 2 * sizeof(unsigned int))
+ elog(ERROR, "bogus tuple length in backward scan");
}
tuplen = getlen(state, state->result_tape, false);
@@ -1994,9 +2001,10 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
* Note: READTUP expects we are positioned after the initial
* length word of the tuple, so back up to that point.
*/
- if (!LogicalTapeBackspace(state->tapeset,
- state->result_tape,
- tuplen))
+ nmoved = LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ tuplen);
+ if (nmoved != tuplen)
elog(ERROR, "bogus tuple length in backward scan");
READTUP(state, stup, state->result_tape, tuplen);
@@ -3183,11 +3191,10 @@ tuplesort_restorepos(Tuplesortstate *state)
state->eof_reached = state->markpos_eof;
break;
case TSS_SORTEDONTAPE:
- if (!LogicalTapeSeek(state->tapeset,
- state->result_tape,
- state->markpos_block,
- state->markpos_offset))
- elog(ERROR, "tuplesort_restorepos failed");
+ LogicalTapeSeek(state->tapeset,
+ state->result_tape,
+ state->markpos_block,
+ state->markpos_offset);
state->eof_reached = state->markpos_eof;
break;
default:
diff --git a/src/include/utils/logtape.h b/src/include/utils/logtape.h
index d7dccb85be4..f580aade53e 100644
--- a/src/include/utils/logtape.h
+++ b/src/include/utils/logtape.h
@@ -35,9 +35,9 @@ extern void LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum,
size_t buffer_size);
extern void LogicalTapeRewindForWrite(LogicalTapeSet *lts, int tapenum);
extern void LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum);
-extern bool LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum,
+extern size_t LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum,
size_t size);
-extern bool LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
+extern void LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
long blocknum, int offset);
extern void LogicalTapeTell(LogicalTapeSet *lts, int tapenum,
long *blocknum, int *offset);