Diffstat (limited to 'src/backend/access/gist')
-rw-r--r--   src/backend/access/gist/gistbuild.c    | 510
-rw-r--r--   src/backend/access/gist/gistproc.c     | 229
-rw-r--r--   src/backend/access/gist/gistutil.c     |  53
-rw-r--r--   src/backend/access/gist/gistvalidate.c |   6
4 files changed, 696 insertions(+), 102 deletions(-)
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 671b5e9186f..230625cf1e2 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -3,6 +3,24 @@
  * gistbuild.c
  *    build algorithm for GiST indexes implementation.
  *
+ * There are two different strategies:
+ *
+ * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted
+ *    order, and create downlinks and internal pages as we go. This builds
+ *    the index from the bottom up, similar to how B-tree index build
+ *    works.
+ *
+ * 2. Start with an empty index, and insert all tuples one by one.
+ *
+ * The sorted method is used if the operator classes for all columns have
+ * a 'sortsupport' defined. Otherwise, we resort to the second strategy.
+ *
+ * The second strategy can optionally use buffers at different levels of
+ * the tree to reduce I/O, see "Buffering build algorithm" in the README
+ * for a more detailed explanation. It initially calls insert over and
+ * over, but switches to the buffered algorithm after a certain number of
+ * tuples (unless buffering mode is disabled).
+ *
  *
  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -28,6 +46,7 @@
 #include "storage/smgr.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
+#include "utils/tuplesort.h"
 
 /* Step of index tuples for check whether to switch to buffering build mode */
 #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
@@ -40,8 +59,14 @@
  */
 #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
 
+/*
+ * Strategy used to build the index. It can change between the
+ * GIST_BUFFERING_* modes on the fly, but if the sorted method is used,
+ * that needs to be decided up-front and cannot be changed afterwards.
+ */
 typedef enum
 {
+    GIST_SORTED_BUILD,          /* bottom-up build by sorting */
     GIST_BUFFERING_DISABLED,    /* in regular build mode and aren't going to
                                  * switch */
     GIST_BUFFERING_AUTO,        /* in regular build mode, but will switch to
@@ -51,7 +76,7 @@ typedef enum
                                  * before switching to the buffering build
                                  * mode */
     GIST_BUFFERING_ACTIVE       /* in buffering build mode */
-} GistBufferingMode;
+} GistBuildMode;
 
 /* Working state for gistbuild and its callback */
 typedef struct
@@ -60,23 +85,58 @@ typedef struct
     Relation    heaprel;
     GISTSTATE  *giststate;
 
-    int64       indtuples;      /* number of tuples indexed */
-    int64       indtuplesSize;  /* total size of all indexed tuples */
-
     Size        freespace;      /* amount of free space to leave on pages */
 
+    GistBuildMode buildMode;
+
+    int64       indtuples;      /* number of tuples indexed */
+
     /*
      * Extra data structures used during a buffering build. 'gfbb' contains
      * information related to managing the build buffers. 'parentMap' is a
      * lookup table of the parent of each internal page.
      */
+    int64       indtuplesSize;  /* total size of all indexed tuples */
     GISTBuildBuffers *gfbb;
     HTAB       *parentMap;
 
-    GistBufferingMode bufferingMode;
+    /*
+     * Extra data structures used during a sorting build.
+     */
+    Tuplesortstate *sortstate;  /* state data for tuplesort.c */
+
+    BlockNumber pages_allocated;
+    BlockNumber pages_written;
+
+    int         ready_num_pages;
+    BlockNumber ready_blknos[XLR_MAX_BLOCK_ID];
+    Page        ready_pages[XLR_MAX_BLOCK_ID];
 } GISTBuildState;
 
+/*
+ * In sorted build, we use a stack of these structs, one for each level,
+ * to hold an in-memory buffer of the rightmost page at the level. When the
+ * page fills up, it is written out and a new page is allocated.
+ */
+typedef struct GistSortedBuildPageState
+{
+    Page        page;
+    struct GistSortedBuildPageState *parent;    /* Upper level, if any */
+} GistSortedBuildPageState;
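The shape of this level stack is easiest to see with a toy model. The sketch below is an editor's illustration only, not part of the patch, and every name in it is invented: it packs a stream of already-sorted keys into fixed-capacity "pages" and creates a parent level on demand whenever a page fills up, mirroring what gist_indexsortbuild_pagestate_add() and _flush() do further down in this file.

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_CAPACITY 4         /* tuples per toy page */

    typedef struct ToyLevel
    {
        int         nitems;         /* tuples on the in-memory rightmost page */
        int         first_key;      /* stand-in for the real union/downlink key */
        struct ToyLevel *parent;    /* next level up, if any */
    } ToyLevel;

    static void toy_flush(ToyLevel *level, int level_no);

    /* Add one key to a level, flushing the rightmost page first if it is full. */
    static void
    toy_add(ToyLevel *level, int level_no, int key)
    {
        if (level->nitems == PAGE_CAPACITY)
            toy_flush(level, level_no);
        if (level->nitems == 0)
            level->first_key = key;
        level->nitems++;
    }

    /* "Write out" the rightmost page and push its downlink to the level above. */
    static void
    toy_flush(ToyLevel *level, int level_no)
    {
        printf("level %d: wrote page [%d..], %d tuples\n",
               level_no, level->first_key, level->nitems);

        /* Create the parent level on demand; it becomes the new root. */
        if (level->parent == NULL)
            level->parent = calloc(1, sizeof(ToyLevel));

        toy_add(level->parent, level_no + 1, level->first_key);
        level->nitems = 0;          /* start a fresh rightmost page */
    }

    int
    main(void)
    {
        ToyLevel    leaf = {0};

        for (int key = 0; key < 20; key++)  /* keys arrive in sorted order */
            toy_add(&leaf, 0, key);

        /* The final partial pages would be flushed bottom-up here, as in the patch. */
        return 0;
    }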
 /* prototypes for private functions */
+
+static void gistSortedBuildCallback(Relation index, ItemPointer tid,
+                                    Datum *values, bool *isnull,
+                                    bool tupleIsAlive, void *state);
+static void gist_indexsortbuild(GISTBuildState *state);
+static void gist_indexsortbuild_pagestate_add(GISTBuildState *state,
+                                              GistSortedBuildPageState *pagestate,
+                                              IndexTuple itup);
+static void gist_indexsortbuild_pagestate_flush(GISTBuildState *state,
+                                                GistSortedBuildPageState *pagestate);
+static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state);
+
 static void gistInitBuffering(GISTBuildState *buildstate);
 static int  calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
 static void gistBuildCallback(Relation index,
@@ -107,10 +167,9 @@ static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
 static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
 static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
 
+
 /*
- * Main entry point to GiST index build. Initially calls insert over and over,
- * but switches to more efficient buffering build algorithm after a certain
- * number of tuples (unless buffering mode is disabled).
+ * Main entry point to GiST index build.
  */
 IndexBuildResult *
 gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
@@ -118,124 +177,407 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     IndexBuildResult *result;
     double      reltuples;
     GISTBuildState buildstate;
-    Buffer      buffer;
-    Page        page;
     MemoryContext oldcxt = CurrentMemoryContext;
     int         fillfactor;
+    Oid         SortSupportFnOids[INDEX_MAX_KEYS];
+    bool        hasallsortsupports;
+    int         keyscount = IndexRelationGetNumberOfKeyAttributes(index);
+    GiSTOptions *options = NULL;
+
+    /*
+     * We expect to be called exactly once for any index relation. If that's
+     * not the case, big trouble's what we have.
+     */
+    if (RelationGetNumberOfBlocks(index) != 0)
+        elog(ERROR, "index \"%s\" already contains data",
+             RelationGetRelationName(index));
+
+    if (index->rd_options)
+        options = (GiSTOptions *) index->rd_options;
 
     buildstate.indexrel = index;
     buildstate.heaprel = heap;
+    buildstate.sortstate = NULL;
+    buildstate.giststate = initGISTstate(index);
 
-    if (index->rd_options)
+    /*
+     * Create a temporary memory context that is reset once for each tuple
+     * processed.  (Note: we don't bother to make this a child of the
+     * giststate's scanCxt, so we have to delete it separately at the end.)
+     */
+    buildstate.giststate->tempCxt = createTempGistContext();
+
+    /*
+     * Choose build strategy.  If all keys support sorting, do that. Otherwise
+     * the default strategy is to switch to buffering mode when the index
+     * grows too large to fit in cache.
+     */
+    hasallsortsupports = true;
+    for (int i = 0; i < keyscount; i++)
     {
-        /* Get buffering mode from the options string */
-        GiSTOptions *options = (GiSTOptions *) index->rd_options;
+        SortSupportFnOids[i] = index_getprocid(index, i + 1,
+                                               GIST_SORTSUPPORT_PROC);
+        if (!OidIsValid(SortSupportFnOids[i]))
+        {
+            hasallsortsupports = false;
+            break;
+        }
+    }
 
+    if (hasallsortsupports)
+    {
+        buildstate.buildMode = GIST_SORTED_BUILD;
+    }
+    else if (options)
+    {
         if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
-            buildstate.bufferingMode = GIST_BUFFERING_STATS;
+            buildstate.buildMode = GIST_BUFFERING_STATS;
         else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
-            buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
+            buildstate.buildMode = GIST_BUFFERING_DISABLED;
         else
-            buildstate.bufferingMode = GIST_BUFFERING_AUTO;
-
-        fillfactor = options->fillfactor;
+            buildstate.buildMode = GIST_BUFFERING_AUTO;
     }
     else
     {
-        /*
-         * By default, switch to buffering mode when the index grows too large
-         * to fit in cache.
-         */
-        buildstate.bufferingMode = GIST_BUFFERING_AUTO;
-        fillfactor = GIST_DEFAULT_FILLFACTOR;
+        buildstate.buildMode = GIST_BUFFERING_AUTO;
     }
-    /* Calculate target amount of free space to leave on pages */
+
+    /*
+     * Calculate target amount of free space to leave on pages.
+     */
+    fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR;
     buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
 
     /*
-     * We expect to be called exactly once for any index relation. If that's
-     * not the case, big trouble's what we have.
+     * Build the index using the chosen strategy.
      */
-    if (RelationGetNumberOfBlocks(index) != 0)
-        elog(ERROR, "index \"%s\" already contains data",
-             RelationGetRelationName(index));
+    buildstate.indtuples = 0;
+    buildstate.indtuplesSize = 0;
 
-    /* no locking is needed */
-    buildstate.giststate = initGISTstate(index);
+    if (buildstate.buildMode == GIST_SORTED_BUILD)
+    {
+        /*
+         * Sort all data, build the index from bottom up.
+         */
+        buildstate.sortstate = tuplesort_begin_index_gist(heap,
+                                                          index,
+                                                          maintenance_work_mem,
+                                                          NULL,
+                                                          false);
+
+        /* Scan the table, adding all tuples to the tuplesort */
+        reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
+                                           gistSortedBuildCallback,
+                                           (void *) &buildstate, NULL);
+
+        /*
+         * Perform the sort and build index pages.
+         */
+        tuplesort_performsort(buildstate.sortstate);
+
+        gist_indexsortbuild(&buildstate);
+
+        tuplesort_end(buildstate.sortstate);
+    }
+    else
+    {
+        /*
+         * Initialize an empty index and insert all tuples, possibly using
+         * buffers on intermediate levels.
+         */
+        Buffer      buffer;
+        Page        page;
+
+        /* initialize the root page */
+        buffer = gistNewBuffer(index);
+        Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
+        page = BufferGetPage(buffer);
+
+        START_CRIT_SECTION();
+
+        GISTInitBuffer(buffer, F_LEAF);
+
+        MarkBufferDirty(buffer);
+        PageSetLSN(page, GistBuildLSN);
+
+        UnlockReleaseBuffer(buffer);
+
+        END_CRIT_SECTION();
+
+        /* Scan the table, inserting all the tuples to the index. */
+        reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
+                                           gistBuildCallback,
+                                           (void *) &buildstate, NULL);
+
+        /*
+         * If buffering was used, flush out all the tuples that are still in
+         * the buffers.
+         */
+        if (buildstate.buildMode == GIST_BUFFERING_ACTIVE)
+        {
+            elog(DEBUG1, "all tuples processed, emptying buffers");
+            gistEmptyAllBuffers(&buildstate);
+            gistFreeBuildBuffers(buildstate.gfbb);
+        }
+
+        /*
+         * We didn't write WAL records as we built the index, so if
+         * WAL-logging is required, write all pages to the WAL now.
+         */
+        if (RelationNeedsWAL(index))
+        {
+            log_newpage_range(index, MAIN_FORKNUM,
+                              0, RelationGetNumberOfBlocks(index),
+                              true);
+        }
+    }
+
+    /* okay, all heap tuples are indexed */
+    MemoryContextSwitchTo(oldcxt);
+    MemoryContextDelete(buildstate.giststate->tempCxt);
+
+    freeGISTstate(buildstate.giststate);
 
     /*
-     * Create a temporary memory context that is reset once for each tuple
-     * processed.  (Note: we don't bother to make this a child of the
-     * giststate's scanCxt, so we have to delete it separately at the end.)
+     * Return statistics
      */
-    buildstate.giststate->tempCxt = createTempGistContext();
+    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
 
-    /* initialize the root page */
-    buffer = gistNewBuffer(index);
-    Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
-    page = BufferGetPage(buffer);
+    result->heap_tuples = reltuples;
+    result->index_tuples = (double) buildstate.indtuples;
 
-    START_CRIT_SECTION();
+    return result;
+}
 
-    GISTInitBuffer(buffer, F_LEAF);
+/*-------------------------------------------------------------------------
+ * Routines for sorted build
+ *-------------------------------------------------------------------------
+ */
 
-    MarkBufferDirty(buffer);
-    PageSetLSN(page, GistBuildLSN);
+/*
+ * Per-tuple callback for table_index_build_scan.
+ */
+static void
+gistSortedBuildCallback(Relation index,
+                        ItemPointer tid,
+                        Datum *values,
+                        bool *isnull,
+                        bool tupleIsAlive,
+                        void *state)
+{
+    GISTBuildState *buildstate = (GISTBuildState *) state;
+    MemoryContext oldCtx;
+    Datum       compressed_values[INDEX_MAX_KEYS];
 
-    UnlockReleaseBuffer(buffer);
+    oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
 
-    END_CRIT_SECTION();
+    /* Form an index tuple and point it at the heap tuple */
+    gistCompressValues(buildstate->giststate, index,
+                       values, isnull,
+                       true, compressed_values);
 
-    /* build the index */
-    buildstate.indtuples = 0;
-    buildstate.indtuplesSize = 0;
+    tuplesort_putindextuplevalues(buildstate->sortstate,
+                                  buildstate->indexrel,
+                                  tid,
+                                  compressed_values, isnull);
+
+    MemoryContextSwitchTo(oldCtx);
+    MemoryContextReset(buildstate->giststate->tempCxt);
+
+    /* Update tuple count. */
+    buildstate->indtuples += 1;
+}
+
+/*
+ * Build GiST index from bottom up from pre-sorted tuples.
+ */
+static void
+gist_indexsortbuild(GISTBuildState *state)
+{
+    IndexTuple  itup;
+    GistSortedBuildPageState *leafstate;
+    GistSortedBuildPageState *pagestate;
+    Page        page;
+
+    state->pages_allocated = 0;
+    state->pages_written = 0;
+    state->ready_num_pages = 0;
 
     /*
-     * Do the heap scan.
+     * Write an empty page as a placeholder for the root page. It will be
+     * replaced with the real root page at the end.
      */
-    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
-                                       gistBuildCallback,
-                                       (void *) &buildstate, NULL);
+    page = palloc0(BLCKSZ);
+    smgrextend(state->indexrel->rd_smgr, MAIN_FORKNUM, GIST_ROOT_BLKNO,
+               page, true);
+    state->pages_allocated++;
+    state->pages_written++;
+
+    /* Allocate a temporary buffer for the first leaf page. */
+    leafstate = palloc(sizeof(GistSortedBuildPageState));
+    leafstate->page = page;
+    leafstate->parent = NULL;
+    gistinitpage(page, F_LEAF);
 
     /*
-     * If buffering was used, flush out all the tuples that are still in the
-     * buffers.
+     * Fill index pages with tuples in the sorted order.
      */
-    if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
+    while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL)
     {
-        elog(DEBUG1, "all tuples processed, emptying buffers");
-        gistEmptyAllBuffers(&buildstate);
-        gistFreeBuildBuffers(buildstate.gfbb);
+        gist_indexsortbuild_pagestate_add(state, leafstate, itup);
+        MemoryContextReset(state->giststate->tempCxt);
     }
 
-    /* okay, all heap tuples are indexed */
-    MemoryContextSwitchTo(oldcxt);
-    MemoryContextDelete(buildstate.giststate->tempCxt);
-
-    freeGISTstate(buildstate.giststate);
-
     /*
-     * We didn't write WAL records as we built the index, so if WAL-logging is
-     * required, write all pages to the WAL now.
+     * Write out the partially full non-root pages.
+     *
+     * Keep in mind that flush can build a new root.
      */
-    if (RelationNeedsWAL(index))
+    pagestate = leafstate;
+    while (pagestate->parent != NULL)
     {
-        log_newpage_range(index, MAIN_FORKNUM,
-                          0, RelationGetNumberOfBlocks(index),
-                          true);
+        GistSortedBuildPageState *parent;
+
+        gist_indexsortbuild_pagestate_flush(state, pagestate);
+        parent = pagestate->parent;
+        pfree(pagestate->page);
+        pfree(pagestate);
+        pagestate = parent;
     }
+    gist_indexsortbuild_flush_ready_pages(state);
+
+    /* Write out the root */
+    PageSetLSN(pagestate->page, GistBuildLSN);
+    smgrwrite(state->indexrel->rd_smgr, MAIN_FORKNUM, GIST_ROOT_BLKNO,
+              pagestate->page, true);
+    if (RelationNeedsWAL(state->indexrel))
+        log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO,
+                    pagestate->page, true);
+
+    pfree(pagestate->page);
+    pfree(pagestate);
+}
+
+/*
+ * Add tuple to a page. If the page is full, write it out and re-initialize
+ * a new page first.
+ */
+static void
+gist_indexsortbuild_pagestate_add(GISTBuildState *state,
+                                  GistSortedBuildPageState *pagestate,
+                                  IndexTuple itup)
+{
+    Size        sizeNeeded;
+
+    /* Does the tuple fit? If not, flush */
+    sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData) + state->freespace;
+    if (PageGetFreeSpace(pagestate->page) < sizeNeeded)
+        gist_indexsortbuild_pagestate_flush(state, pagestate);
+
+    gistfillbuffer(pagestate->page, &itup, 1, InvalidOffsetNumber);
+}
+
+static void
+gist_indexsortbuild_pagestate_flush(GISTBuildState *state,
+                                    GistSortedBuildPageState *pagestate)
+{
+    GistSortedBuildPageState *parent;
+    IndexTuple *itvec;
+    IndexTuple  union_tuple;
+    int         vect_len;
+    bool        isleaf;
+    BlockNumber blkno;
+    MemoryContext oldCtx;
+
+    /* check once per page */
+    CHECK_FOR_INTERRUPTS();
+
+    if (state->ready_num_pages == XLR_MAX_BLOCK_ID)
+        gist_indexsortbuild_flush_ready_pages(state);
+
     /*
-     * Return statistics
+     * The page is now complete. Assign a block number to it, and add it to
+     * the list of finished pages. (We don't write it out immediately, because
+     * we want to WAL-log the pages in batches.)
      */
-    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+    blkno = state->pages_allocated++;
+    state->ready_blknos[state->ready_num_pages] = blkno;
+    state->ready_pages[state->ready_num_pages] = pagestate->page;
+    state->ready_num_pages++;
 
-    result->heap_tuples = reltuples;
-    result->index_tuples = (double) buildstate.indtuples;
+    isleaf = GistPageIsLeaf(pagestate->page);
 
-    return result;
+    /*
+     * Form a downlink tuple to represent all the tuples on the page.
+     */
+    oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
+    itvec = gistextractpage(pagestate->page, &vect_len);
+    union_tuple = gistunion(state->indexrel, itvec, vect_len,
+                            state->giststate);
+    ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
+    MemoryContextSwitchTo(oldCtx);
+
+    /*
+     * Insert the downlink to the parent page. If this was the root, create a
+     * new page as the parent, which becomes the new root.
+     */
+    parent = pagestate->parent;
+    if (parent == NULL)
+    {
+        parent = palloc(sizeof(GistSortedBuildPageState));
+        parent->page = (Page) palloc(BLCKSZ);
+        parent->parent = NULL;
+        gistinitpage(parent->page, 0);
+
+        pagestate->parent = parent;
+    }
+    gist_indexsortbuild_pagestate_add(state, parent, union_tuple);
+
+    /* Re-initialize the page buffer for next page on this level. */
+    pagestate->page = palloc(BLCKSZ);
+    gistinitpage(pagestate->page, isleaf ? F_LEAF : 0);
+}
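The union tuple produced here must cover every tuple on the flushed page. For geometric opclasses that union is conceptually a running bounding box, as in this simplified sketch (illustration only; the patch gets the real key from the opclass 'union' support function via gistunion()):

    #include <stdio.h>
    #include <float.h>

    typedef struct ToyBox
    {
        double      xmin, ymin, xmax, ymax;
    } ToyBox;

    /* Bounding box covering all keys on a finished page: the downlink key. */
    static ToyBox
    toy_union(const ToyBox *keys, int n)
    {
        ToyBox      u = {DBL_MAX, DBL_MAX, -DBL_MAX, -DBL_MAX};

        for (int i = 0; i < n; i++)
        {
            if (keys[i].xmin < u.xmin) u.xmin = keys[i].xmin;
            if (keys[i].ymin < u.ymin) u.ymin = keys[i].ymin;
            if (keys[i].xmax > u.xmax) u.xmax = keys[i].xmax;
            if (keys[i].ymax > u.ymax) u.ymax = keys[i].ymax;
        }
        return u;
    }

    int
    main(void)
    {
        ToyBox      keys[] = {{0, 0, 1, 1}, {2, -1, 3, 0.5}};
        ToyBox      u = toy_union(keys, 2);

        printf("union: (%g,%g)-(%g,%g)\n", u.xmin, u.ymin, u.xmax, u.ymax);
        return 0;
    }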
+
+static void
+gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
+{
+    if (state->ready_num_pages == 0)
+        return;
+
+    for (int i = 0; i < state->ready_num_pages; i++)
+    {
+        Page        page = state->ready_pages[i];
+
+        /* Currently, the blocks must be buffered in order. */
+        if (state->ready_blknos[i] != state->pages_written)
+            elog(ERROR, "unexpected block number to flush GiST sorting build");
+
+        PageSetLSN(page, GistBuildLSN);
+
+        smgrextend(state->indexrel->rd_smgr,
+                   MAIN_FORKNUM,
+                   state->pages_written++,
+                   page,
+                   true);
+    }
+
+    if (RelationNeedsWAL(state->indexrel))
+        log_newpages(&state->indexrel->rd_node, MAIN_FORKNUM, state->ready_num_pages,
+                     state->ready_blknos, state->ready_pages, true);
+
+    for (int i = 0; i < state->ready_num_pages; i++)
+        pfree(state->ready_pages[i]);
+
+    state->ready_num_pages = 0;
+}
+
+/*-------------------------------------------------------------------------
+ * Routines for non-sorted build
+ *-------------------------------------------------------------------------
+ */
+
 /*
  * Attempt to switch to buffering mode.
  *
@@ -375,7 +717,7 @@ gistInitBuffering(GISTBuildState *buildstate)
     if (levelStep <= 0)
     {
         elog(DEBUG1, "failed to switch to buffered GiST build");
-        buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
+        buildstate->buildMode = GIST_BUFFERING_DISABLED;
         return;
     }
 
@@ -392,7 +734,7 @@ gistInitBuffering(GISTBuildState *buildstate)
 
     gistInitParentMap(buildstate);
 
-    buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;
+    buildstate->buildMode = GIST_BUFFERING_ACTIVE;
 
     elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
          levelStep, pagesPerBuffer);
@@ -453,10 +795,12 @@ gistBuildCallback(Relation index,
     oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
 
     /* form an index tuple and point it at the heap tuple */
-    itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
+    itup = gistFormTuple(buildstate->giststate, index,
+                         values, isnull,
+                         true);
     itup->t_tid = *tid;
 
-    if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
+    if (buildstate->buildMode == GIST_BUFFERING_ACTIVE)
     {
         /* We have buffers, so use them. */
         gistBufferingBuildInsert(buildstate, itup);
@@ -478,7 +822,7 @@
     MemoryContextSwitchTo(oldCtx);
     MemoryContextReset(buildstate->giststate->tempCxt);
 
-    if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
+    if (buildstate->buildMode == GIST_BUFFERING_ACTIVE &&
         buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
     {
         /* Adjust the target buffer size now */
@@ -493,10 +837,10 @@
      * To avoid excessive calls to smgrnblocks(), only check this every
      * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples
      */
-    if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
+    if ((buildstate->buildMode == GIST_BUFFERING_AUTO &&
          buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
          effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
-        (buildstate->bufferingMode == GIST_BUFFERING_STATS &&
+        (buildstate->buildMode == GIST_BUFFERING_STATS &&
          buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
     {
         /*
diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c
index 9ace64c3c4a..27d9c0f77c3 100644
--- a/src/backend/access/gist/gistproc.c
+++ b/src/backend/access/gist/gistproc.c
@@ -24,6 +24,7 @@
 #include "utils/builtins.h"
 #include "utils/float.h"
 #include "utils/geo_decls.h"
+#include "utils/sortsupport.h"
 
 
 static bool gist_box_leaf_consistent(BOX *key, BOX *query,
@@ -31,6 +32,15 @@ static bool gist_box_leaf_consistent(BOX *key, BOX *query,
 static bool rtree_internal_consistent(BOX *key, BOX *query,
                                       StrategyNumber strategy);
 
+static uint64 point_zorder_internal(float4 x, float4 y);
+static uint64 part_bits32_by2(uint32 x);
+static uint32 ieee_float32_to_uint32(float f);
+static int  gist_bbox_zorder_cmp(Datum a, Datum b, SortSupport ssup);
+static Datum gist_bbox_zorder_abbrev_convert(Datum original, SortSupport ssup);
+static int  gist_bbox_zorder_cmp_abbrev(Datum z1, Datum z2, SortSupport ssup);
+static bool gist_bbox_zorder_abbrev_abort(int memtupcount, SortSupport ssup);
+
+
 /* Minimum accepted ratio of split */
 #define LIMIT_RATIO 0.3
 
@@ -1540,3 +1550,222 @@ gist_poly_distance(PG_FUNCTION_ARGS)
 
     PG_RETURN_FLOAT8(distance);
 }
+
+/*
+ * Z-order routines for fast index build
+ */
+
+/*
+ * Compute Z-value of a point
+ *
+ * Z-order (also known as Morton Code) maps a two-dimensional point to a
+ * single integer, in a way that preserves locality. Points that are close in
+ * the two-dimensional space are mapped to integers that are not far from
+ * each other. We do that by interleaving the bits in the X and Y components.
+ *
+ * Morton Code is normally defined only for integers, but the X and Y values
+ * of a point are floating point. We expect floats to be in IEEE format.
+ */
+static uint64
+point_zorder_internal(float4 x, float4 y)
+{
+    uint32      ix = ieee_float32_to_uint32(x);
+    uint32      iy = ieee_float32_to_uint32(y);
+
+    /* Interleave the bits */
+    return part_bits32_by2(ix) | (part_bits32_by2(iy) << 1);
+}
+
+/* Interleave 32 bits with zeroes */
+static uint64
+part_bits32_by2(uint32 x)
+{
+    uint64      n = x;
+
+    n = (n | (n << 16)) & UINT64CONST(0x0000FFFF0000FFFF);
+    n = (n | (n << 8)) & UINT64CONST(0x00FF00FF00FF00FF);
+    n = (n | (n << 4)) & UINT64CONST(0x0F0F0F0F0F0F0F0F);
+    n = (n | (n << 2)) & UINT64CONST(0x3333333333333333);
+    n = (n | (n << 1)) & UINT64CONST(0x5555555555555555);
+
+    return n;
+}
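As a quick sanity check of the interleaving, consider x = 3 (binary 011) and y = 5 (binary 101): the X bits land on even positions and the Y bits on odd positions, giving z = 100111 in binary, or 39. The standalone snippet below (illustration only; the bit-spreading routine is copied from the patch under a different name) reproduces that:

    #include <stdio.h>
    #include <stdint.h>

    /* Same bit spreading as the patch's part_bits32_by2() */
    static uint64_t
    spread_by2(uint32_t x)
    {
        uint64_t    n = x;

        n = (n | (n << 16)) & 0x0000FFFF0000FFFFULL;
        n = (n | (n << 8)) & 0x00FF00FF00FF00FFULL;
        n = (n | (n << 4)) & 0x0F0F0F0F0F0F0F0FULL;
        n = (n | (n << 2)) & 0x3333333333333333ULL;
        n = (n | (n << 1)) & 0x5555555555555555ULL;
        return n;
    }

    int
    main(void)
    {
        uint32_t    x = 3;          /* binary 011 */
        uint32_t    y = 5;          /* binary 101 */

        /* X bits go to even positions, Y bits to odd positions */
        uint64_t    z = spread_by2(x) | (spread_by2(y) << 1);

        printf("z = %llu\n", (unsigned long long) z);   /* prints 39 */
        return 0;
    }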
+
+/*
+ * Convert a 32-bit IEEE float to uint32 in a way that preserves the ordering
+ */
+static uint32
+ieee_float32_to_uint32(float f)
+{
+    /*----
+     *
+     * IEEE 754 floating point format
+     * ------------------------------
+     *
+     * IEEE 754 floating point numbers have this format:
+     *
+     *        exponent (8 bits)
+     *        |
+     * s eeeeeeee mmmmmmmmmmmmmmmmmmmmmmm
+     * |          |
+     * sign       mantissa (23 bits)
+     *
+     * Infinity has all bits in the exponent set and the mantissa is all
+     * zeros. Negative infinity is the same but with the sign bit set.
+     *
+     * NaNs are represented with all bits in the exponent set, and at least
+     * one bit in the mantissa set. The mantissa bits can be used to
+     * distinguish different kinds of NaNs.
+     *
+     * The IEEE format has the nice property that when you take the bit
+     * representation and interpret it as an integer, the order is preserved,
+     * except for the sign. That holds for the +-Infinity values too.
+     *
+     * Mapping to uint32
+     * -----------------
+     *
+     * In order to have a smooth transition from negative to positive numbers,
+     * we map floats to unsigned integers like this:
+     *
+     * x < 0 to range 0-7FFFFFFF
+     * x = 0 to value 80000000 (negative zero maps to the adjacent 7FFFFFFF)
+     * x > 0 to range 80000001-FFFFFFFF
+     *
+     * We don't care to distinguish different kinds of NaNs, so they are all
+     * mapped to the same arbitrary value, FFFFFFFF. Because of the IEEE bit
+     * representation of NaNs, there aren't any non-NaN values that would be
+     * mapped to FFFFFFFF. In fact, there is a range of unused values on both
+     * ends of the uint32 space.
+     */
+    if (isnan(f))
+        return 0xFFFFFFFF;
+    else
+    {
+        union
+        {
+            float       f;
+            uint32      i;
+        }           u;
+
+        u.f = f;
+
+        /* Check the sign bit */
+        if ((u.i & 0x80000000) != 0)
+        {
+            /*
+             * Map the negative value to range 0-7FFFFFFF. This flips the sign
+             * bit to 0 in the same instruction.
+             */
+            Assert(f <= 0);     /* can be -0 */
+            u.i ^= 0xFFFFFFFF;
+        }
+        else
+        {
+            /* Map the positive value (or 0) to range 80000000-FFFFFFFF */
+            u.i |= 0x80000000;
+        }
+
+        return u.i;
+    }
+}
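To see the mapping in action, the standalone snippet below (illustration only) applies the same transformation to a few sample floats. The outputs are strictly increasing -- 3FFFFFFF, 407FFFFF, 7FFFFFFF, 80000000, BF800000, C0000000 -- with +0.0 landing on 80000000 and -0.0 on the adjacent 7FFFFFFF:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <math.h>

    /* Same order-preserving transformation as the patch's ieee_float32_to_uint32() */
    static uint32_t
    float_to_ordered_u32(float f)
    {
        uint32_t    i;

        if (isnan(f))
            return 0xFFFFFFFF;
        memcpy(&i, &f, sizeof(i));  /* well-defined way to grab the bits */
        if (i & 0x80000000)
            i ^= 0xFFFFFFFF;        /* negative: flip all bits */
        else
            i |= 0x80000000;        /* positive or +0: flip just the sign bit */
        return i;
    }

    int
    main(void)
    {
        float       in[] = {-2.0f, -1.0f, -0.0f, 0.0f, 1.0f, 2.0f};

        for (int i = 0; i < 6; i++)
            printf("%5.1f -> %08lX\n", (double) in[i],
                   (unsigned long) float_to_ordered_u32(in[i]));
        return 0;
    }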
+
+/*
+ * Compare the Z-order of points
+ */
+static int
+gist_bbox_zorder_cmp(Datum a, Datum b, SortSupport ssup)
+{
+    Point      *p1 = &(DatumGetBoxP(a)->low);
+    Point      *p2 = &(DatumGetBoxP(b)->low);
+    uint64      z1;
+    uint64      z2;
+
+    /*
+     * Do a quick check for equality first. It's not clear if this is worth
+     * it in general, but it certainly is when used as tie-breaker with
+     * abbreviated keys.
+     */
+    if (p1->x == p2->x && p1->y == p2->y)
+        return 0;
+
+    z1 = point_zorder_internal(p1->x, p1->y);
+    z2 = point_zorder_internal(p2->x, p2->y);
+    if (z1 > z2)
+        return 1;
+    else if (z1 < z2)
+        return -1;
+    else
+        return 0;
+}
+
+/*
+ * Abbreviated version of Z-order comparison
+ *
+ * The abbreviated format is a Z-order value computed from the two 32-bit
+ * floats. If SIZEOF_DATUM == 8, the 64-bit Z-order value fits fully in the
+ * abbreviated Datum, otherwise its most significant bits are used.
+ */
+static Datum
+gist_bbox_zorder_abbrev_convert(Datum original, SortSupport ssup)
+{
+    Point      *p = &(DatumGetBoxP(original)->low);
+    uint64      z;
+
+    z = point_zorder_internal(p->x, p->y);
+
+#if SIZEOF_DATUM == 8
+    return (Datum) z;
+#else
+    return (Datum) (z >> 32);
+#endif
+}
+
+static int
+gist_bbox_zorder_cmp_abbrev(Datum z1, Datum z2, SortSupport ssup)
+{
+    /*
+     * Compare the pre-computed Z-orders as unsigned integers. Datum is a
+     * typedef for 'uintptr_t', so no casting is required.
+     */
+    if (z1 > z2)
+        return 1;
+    else if (z1 < z2)
+        return -1;
+    else
+        return 0;
+}
+
+/*
+ * We never consider aborting the abbreviation.
+ *
+ * On 64-bit systems, the abbreviation is not lossy so it is always
+ * worthwhile. (Perhaps it's not on 32-bit systems, but we don't bother
+ * with logic to decide.)
+ */
+static bool
+gist_bbox_zorder_abbrev_abort(int memtupcount, SortSupport ssup)
+{
+    return false;
+}
+
+/*
+ * Sort support routine for fast GiST index build by sorting.
+ */
+Datum
+gist_point_sortsupport(PG_FUNCTION_ARGS)
+{
+    SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+    if (ssup->abbreviate)
+    {
+        ssup->comparator = gist_bbox_zorder_cmp_abbrev;
+        ssup->abbrev_converter = gist_bbox_zorder_abbrev_convert;
+        ssup->abbrev_abort = gist_bbox_zorder_abbrev_abort;
+        ssup->abbrev_full_comparator = gist_bbox_zorder_cmp;
+    }
+    else
+    {
+        ssup->comparator = gist_bbox_zorder_cmp;
+    }
+    PG_RETURN_VOID();
+}
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 0516059e3dd..615b5ade233 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -572,12 +572,31 @@ gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
 
 IndexTuple
 gistFormTuple(GISTSTATE *giststate, Relation r,
-              Datum attdata[], bool isnull[], bool isleaf)
+              Datum *attdata, bool *isnull, bool isleaf)
 {
     Datum       compatt[INDEX_MAX_KEYS];
-    int         i;
     IndexTuple  res;
 
+    gistCompressValues(giststate, r, attdata, isnull, isleaf, compatt);
+
+    res = index_form_tuple(isleaf ? giststate->leafTupdesc :
+                           giststate->nonLeafTupdesc,
+                           compatt, isnull);
+
+    /*
+     * The offset number on tuples on internal pages is unused. For historical
+     * reasons, it is set to 0xffff.
+     */
+    ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff);
+    return res;
+}
+
+void
+gistCompressValues(GISTSTATE *giststate, Relation r,
+                   Datum *attdata, bool *isnull, bool isleaf, Datum *compatt)
+{
+    int         i;
+
     /*
      * Call the compress method on each attribute.
      */
@@ -617,17 +636,6 @@ gistFormTuple(GISTSTATE *giststate, Relation r,
             compatt[i] = attdata[i];
         }
     }
-
-    res = index_form_tuple(isleaf ? giststate->leafTupdesc :
-                           giststate->nonLeafTupdesc,
-                           compatt, isnull);
-
-    /*
-     * The offset number on tuples on internal pages is unused. For historical
-     * reasons, it is set to 0xffff.
- */ - ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff); - return res; } /* @@ -745,14 +753,11 @@ gistpenalty(GISTSTATE *giststate, int attno, * Initialize a new index page */ void -GISTInitBuffer(Buffer b, uint32 f) +gistinitpage(Page page, uint32 f) { GISTPageOpaque opaque; - Page page; - Size pageSize; + Size pageSize = BLCKSZ; - pageSize = BufferGetPageSize(b); - page = BufferGetPage(b); PageInit(page, pageSize, sizeof(GISTPageOpaqueData)); opaque = GistPageGetOpaque(page); @@ -764,6 +769,18 @@ GISTInitBuffer(Buffer b, uint32 f) } /* + * Initialize a new index buffer + */ +void +GISTInitBuffer(Buffer b, uint32 f) +{ + Page page; + + page = BufferGetPage(b); + gistinitpage(page, f); +} + +/* * Verify that a freshly-read page looks sane. */ void diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c index 2b9ab693be1..8a14620fab2 100644 --- a/src/backend/access/gist/gistvalidate.c +++ b/src/backend/access/gist/gistvalidate.c @@ -143,6 +143,10 @@ gistvalidate(Oid opclassoid) case GIST_OPTIONS_PROC: ok = check_amoptsproc_signature(procform->amproc); break; + case GIST_SORTSUPPORT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 1, 1, INTERNALOID); + break; default: ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -263,7 +267,7 @@ gistvalidate(Oid opclassoid) continue; /* got it */ if (i == GIST_DISTANCE_PROC || i == GIST_FETCH_PROC || i == GIST_COMPRESS_PROC || i == GIST_DECOMPRESS_PROC || - i == GIST_OPTIONS_PROC) + i == GIST_OPTIONS_PROC || i == GIST_SORTSUPPORT_PROC) continue; /* optional methods */ ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), |