diff options
Diffstat (limited to 'src')
25 files changed, 1871 insertions, 147 deletions
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index b926689c5cb..880f2db5266 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/common/reloptions.c,v 1.23 2009/03/23 16:36:27 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/common/reloptions.c,v 1.24 2009/03/24 20:17:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -56,6 +56,14 @@ static relopt_bool boolRelOpts[] = }, true }, + { + { + "fastupdate", + "Enables \"fast update\" feature for this GIN index", + RELOPT_KIND_GIN + }, + true + }, /* list terminator */ { { NULL } } }; diff --git a/src/backend/access/gin/Makefile b/src/backend/access/gin/Makefile index 08946c88a73..23b75fc1d80 100644 --- a/src/backend/access/gin/Makefile +++ b/src/backend/access/gin/Makefile @@ -4,7 +4,7 @@ # Makefile for access/gin # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/gin/Makefile,v 1.3 2008/02/19 10:30:06 petere Exp $ +# $PostgreSQL: pgsql/src/backend/access/gin/Makefile,v 1.4 2009/03/24 20:17:10 tgl Exp $ # #------------------------------------------------------------------------- @@ -14,6 +14,6 @@ include $(top_builddir)/src/Makefile.global OBJS = ginutil.o gininsert.o ginxlog.o ginentrypage.o gindatapage.o \ ginbtree.o ginscan.o ginget.o ginvacuum.o ginarrayproc.o \ - ginbulk.o + ginbulk.o ginfast.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c index 136f80d9977..a7258619aee 100644 --- a/src/backend/access/gin/ginbulk.c +++ b/src/backend/access/gin/ginbulk.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginbulk.c,v 1.14 2009/01/01 17:23:34 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginbulk.c,v 1.15 
2009/03/24 20:17:10 tgl Exp $ *------------------------------------------------------------------------- */ @@ -197,6 +197,8 @@ ginInsertRecordBA(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber att if (nentry <= 0) return; + Assert(ItemPointerIsValid(heapptr) && attnum >= FirstOffsetNumber); + i = nentry - 1; for (; i > 0; i >>= 1) nbit++; diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index d0e426c6560..a872d44880c 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/gindatapage.c,v 1.13 2009/01/01 17:23:34 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/gindatapage.c,v 1.14 2009/03/24 20:17:10 tgl Exp $ *------------------------------------------------------------------------- */ @@ -43,8 +43,16 @@ MergeItemPointers(ItemPointerData *dst, ItemPointerData *a, uint32 na, ItemPoint while (aptr - a < na && bptr - b < nb) { - if (compareItemPointers(aptr, bptr) > 0) + int cmp = compareItemPointers(aptr, bptr); + + if (cmp > 0) + *dptr++ = *bptr++; + else if (cmp == 0) + { + /* we want only one copy of the identical items */ *dptr++ = *bptr++; + aptr++; + } else *dptr++ = *aptr++; } @@ -630,11 +638,16 @@ insertItemPointer(GinPostingTreeScan *gdi, ItemPointerData *items, uint32 nitem) gdi->stack = ginFindLeafPage(&gdi->btree, gdi->stack); if (gdi->btree.findItem(&(gdi->btree), gdi->stack)) - elog(ERROR, "item pointer (%u,%d) already exists", - ItemPointerGetBlockNumber(gdi->btree.items + gdi->btree.curitem), - ItemPointerGetOffsetNumber(gdi->btree.items + gdi->btree.curitem)); - - ginInsertValue(&(gdi->btree), gdi->stack); + { + /* + * gdi->btree.items[gdi->btree.curitem] already exists in index + */ + gdi->btree.curitem++; + LockBuffer(gdi->stack->buffer, GIN_UNLOCK); + freeGinBtreeStack(gdi->stack); + } + else + 
ginInsertValue(&(gdi->btree), gdi->stack); gdi->stack = NULL; }
diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
new file mode 100644
index 00000000000..d8624237ec1
--- /dev/null
+++ b/src/backend/access/gin/ginfast.c
@@ -0,0 +1,866 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginfast.c
+ *	  Fast insert routines for the Postgres inverted index access method.
+ *	  Pending entries are stored in linear list of pages. Later on
+ *	  (typically during VACUUM), ginInsertCleanup() will be invoked to
+ *	  transfer pending entries into the regular index structure. This
+ *	  wins because bulk insertion is much more efficient than retail.
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *			$PostgreSQL: pgsql/src/backend/access/gin/ginfast.c,v 1.1 2009/03/24 20:17:10 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/gin.h"
+#include "access/tuptoaster.h"
+#include "catalog/index.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "utils/memutils.h"
+
+
+#define GIN_PAGE_FREESIZE \
+	( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) )
+
+typedef struct DatumArray
+{
+	Datum	   *values;			/* expansible array */
+	int32		nvalues;		/* current number of valid entries */
+	int32		maxvalues;		/* allocated size of array */
+} DatumArray;
+
+
+/*
+ * Build a pending-list page from the given array of tuples, and write it out.
+ *
+ * The buffer is initialized as a GIN_LIST page, filled with the ntuples
+ * tuples, and linked to rightlink.  A page whose rightlink is
+ * InvalidBlockNumber (a tail page) is flagged as holding a full row and
+ * gets maxoff = 1; any other page gets maxoff = 0.  Unless the index is
+ * temporary, a XLOG_GIN_INSERT_LISTPAGE record carrying a copy of the
+ * tuples is written.  The buffer is unlocked and released before return.
+ *
+ * Returns the free space remaining on the page after the tuples were added.
+ */
+static int32
+writeListPage(Relation index, Buffer buffer,
+			  IndexTuple *tuples, int32 ntuples, BlockNumber rightlink)
+{
+	Page		page = BufferGetPage(buffer);
+	int			i, freesize, size=0;
+	OffsetNumber l, off;
+	char	   *workspace;
+	char	   *ptr;
+
+	/* workspace could be a local array; we use palloc for alignment */
+	workspace = palloc(BLCKSZ);
+
+	START_CRIT_SECTION();
+
+	GinInitBuffer(buffer, GIN_LIST);
+
+	off = FirstOffsetNumber;
+	ptr = workspace;
+
+	for(i=0; i<ntuples; i++)
+	{
+		int this_size = IndexTupleSize(tuples[i]);
+
+		memcpy(ptr, tuples[i], this_size);
+		ptr += this_size;
+		size += this_size;
+
+		l = PageAddItem(page, (Item)tuples[i], this_size, off, false, false);
+
+		if (l == InvalidOffsetNumber)
+			elog(ERROR, "failed to add item to index page in \"%s\"",
+				 RelationGetRelationName(index));
+
+		off++;
+	}
+
+	Assert(size <= BLCKSZ);		/* else we overran workspace */
+
+	GinPageGetOpaque(page)->rightlink = rightlink;
+
+	/*
+	 * tail page may contain only the whole row(s) or final
+	 * part of row placed on previous pages
+	 */
+	if ( rightlink == InvalidBlockNumber )
+	{
+		GinPageSetFullRow(page);
+		GinPageGetOpaque(page)->maxoff = 1;
+	}
+	else
+	{
+		GinPageGetOpaque(page)->maxoff = 0;
+	}
+
+	freesize = PageGetFreeSpace(page);
+
+	MarkBufferDirty(buffer);
+
+	if (!index->rd_istemp)
+	{
+		XLogRecData				rdata[2];
+		ginxlogInsertListPage	data;
+		XLogRecPtr				recptr;
+
+		rdata[0].buffer = buffer;
+		rdata[0].buffer_std = true;
+		rdata[0].data = (char*)&data;
+		rdata[0].len = sizeof(ginxlogInsertListPage);
+		rdata[0].next = rdata+1;
+
+		rdata[1].buffer = InvalidBuffer;
+		rdata[1].data = workspace;
+		rdata[1].len = size;
+		rdata[1].next = NULL;
+
+		data.blkno = BufferGetBlockNumber(buffer);
+		data.rightlink = rightlink;
+		data.ntuples = ntuples;
+
+		recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE, rdata);
+		PageSetLSN(page, recptr);
+		PageSetTLI(page, ThisTimeLineID);
+	}
+
+	UnlockReleaseBuffer(buffer);
+
+	END_CRIT_SECTION();
+
+	pfree(workspace);
+
+	return freesize;
+}
+
+static void
+makeSublist(Relation index, IndexTuple *tuples, int32 ntuples,
+			GinMetaPageData *res)
+{
+	Buffer			curBuffer = InvalidBuffer;
+	Buffer			prevBuffer = InvalidBuffer;
+	int				i, size = 0, tupsize;
+	int				startTuple = 0;
+
+	Assert(ntuples > 0);
+
+	/*
+	 * Split tuples into pages
+	 *
+	 * Whenever a new page is started, the previous page (if any) is written
+	 * out with its rightlink pointing at the new page; the very first page
+	 * becomes res->head.  A tuple that does not fit on the current page
+	 * forces a new page and is then reprocessed.
+	 */
+	for(i=0;i<ntuples;i++)
+	{
+		if ( curBuffer == InvalidBuffer )
+		{
+			curBuffer = GinNewBuffer(index);
+
+			if ( prevBuffer != InvalidBuffer )
+			{
+				res->nPendingPages++;
+				writeListPage(index, prevBuffer,
+							  tuples+startTuple, i-startTuple,
+							  BufferGetBlockNumber(curBuffer));
+			}
+			else
+			{
+				res->head = BufferGetBlockNumber(curBuffer);
+			}
+
+			prevBuffer = curBuffer;
+			startTuple = i;
+			size = 0;
+		}
+
+		tupsize = MAXALIGN(IndexTupleSize(tuples[i])) + sizeof(ItemIdData);
+
+		if ( size + tupsize >= GinListPageSize )
+		{
+			/* won't fit, force a new page and reprocess */
+			i--;
+			curBuffer = InvalidBuffer;
+		}
+		else
+		{
+			size += tupsize;
+		}
+	}
+
+	/*
+	 * Write last page
+	 */
+	res->tail = BufferGetBlockNumber(curBuffer);
+	res->tailFreeSize = writeListPage(index, curBuffer,
+									  tuples+startTuple, ntuples-startTuple,
+									  InvalidBlockNumber);
+	res->nPendingPages++;
+	/* that was only one heap tuple */
+	res->nPendingHeapTuples = 1;
+}
+
+/*
+ * Inserts collected values during normal insertion.
Function guarantees
+ * that all values of heap will be stored sequentially, preserving order
+ */
+void
+ginHeapTupleFastInsert(Relation index, GinState *ginstate,
+					   GinTupleCollector *collector)
+{
+	Buffer				metabuffer;
+	Page				metapage;
+	GinMetaPageData	   *metadata = NULL;
+	XLogRecData			rdata[2];
+	Buffer				buffer = InvalidBuffer;
+	Page				page = NULL;
+	ginxlogUpdateMeta	data;
+	bool				separateList = false;
+	bool				needCleanup = false;
+
+	if ( collector->ntuples == 0 )
+		return;
+
+	data.node = index->rd_node;
+	data.ntuples = 0;
+	data.newRightlink = data.prevTail = InvalidBlockNumber;
+
+	rdata[0].buffer = InvalidBuffer;
+	rdata[0].data = (char *) &data;
+	rdata[0].len = sizeof(ginxlogUpdateMeta);
+	rdata[0].next = NULL;
+
+	metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
+	metapage = BufferGetPage(metabuffer);
+
+	if ( collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GIN_PAGE_FREESIZE )
+	{
+		/*
+		 * Total size is greater than one page => make sublist
+		 */
+		separateList = true;
+	}
+	else
+	{
+		LockBuffer(metabuffer, GIN_EXCLUSIVE);
+		metadata = GinPageGetMeta(metapage);
+
+		if ( metadata->head == InvalidBlockNumber ||
+			collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize )
+		{
+			/*
+			 * Pending list is empty or total size is greater than freespace
+			 * on tail page => make sublist
+			 *
+			 * We unlock metabuffer to keep high concurrency
+			 */
+			separateList = true;
+			LockBuffer(metabuffer, GIN_UNLOCK);
+		}
+	}
+
+	if ( separateList )
+	{
+		GinMetaPageData		sublist;
+
+		/*
+		 * We should make sublist separately and append it to the tail
+		 */
+		memset( &sublist, 0, sizeof(GinMetaPageData) );
+
+		makeSublist(index, collector->tuples, collector->ntuples, &sublist);
+
+		/*
+		 * metapage was unlocked, see above
+		 */
+		LockBuffer(metabuffer, GIN_EXCLUSIVE);
+		metadata = GinPageGetMeta(metapage);
+
+		if ( metadata->head == InvalidBlockNumber )
+		{
+			/*
+			 * Sublist becomes main list
+			 */
+			START_CRIT_SECTION();
+			memcpy(metadata, &sublist, sizeof(GinMetaPageData) );
+			memcpy(&data.metadata, &sublist, sizeof(GinMetaPageData) );
+		}
+		else
+		{
+			/*
+			 * merge lists: link the current tail page to the head of the
+			 * sublist and make the sublist's tail the new tail
+			 */
+
+			data.prevTail = metadata->tail;
+			buffer = ReadBuffer(index, metadata->tail);
+			LockBuffer(buffer, GIN_EXCLUSIVE);
+			page = BufferGetPage(buffer);
+			Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber);
+
+			START_CRIT_SECTION();
+
+			GinPageGetOpaque(page)->rightlink = sublist.head;
+			metadata->tail = sublist.tail;
+			metadata->tailFreeSize = sublist.tailFreeSize;
+
+			metadata->nPendingPages += sublist.nPendingPages;
+			metadata->nPendingHeapTuples += sublist.nPendingHeapTuples;
+
+			memcpy(&data.metadata, metadata, sizeof(GinMetaPageData) );
+			data.newRightlink = sublist.head;
+
+			MarkBufferDirty(buffer);
+		}
+	}
+	else
+	{
+		/*
+		 * Insert into tail page, metapage is already locked
+		 */
+
+		OffsetNumber	l, off;
+		int				i, tupsize;
+		char		   *ptr;
+
+		buffer = ReadBuffer(index, metadata->tail);
+		LockBuffer(buffer, GIN_EXCLUSIVE);
+		page = BufferGetPage(buffer);
+		off = (PageIsEmpty(page)) ? FirstOffsetNumber :
+				OffsetNumberNext(PageGetMaxOffsetNumber(page));
+
+		rdata[0].next = rdata + 1;
+
+		rdata[1].buffer = buffer;
+		rdata[1].buffer_std = true;
+		ptr = rdata[1].data = (char *) palloc( collector->sumsize );
+		rdata[1].len = collector->sumsize;
+		rdata[1].next = NULL;
+
+		data.ntuples = collector->ntuples;
+
+		START_CRIT_SECTION();
+
+		/*
+		 * Increase counter of heap tuples
+		 */
+		Assert( GinPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples );
+		GinPageGetOpaque(page)->maxoff++;
+		metadata->nPendingHeapTuples++;
+
+		for(i=0; i<collector->ntuples; i++)
+		{
+			tupsize = IndexTupleSize(collector->tuples[i]);
+			l = PageAddItem(page, (Item)collector->tuples[i], tupsize, off, false, false);
+
+			if (l == InvalidOffsetNumber)
+				elog(ERROR, "failed to add item to index page in \"%s\"",
+					 RelationGetRelationName(index));
+
+			memcpy(ptr, collector->tuples[i], tupsize);
+			ptr+=tupsize;
+
+			off++;
+		}
+
+		metadata->tailFreeSize -= collector->sumsize + collector->ntuples * sizeof(ItemIdData);
+		memcpy(&data.metadata, metadata, sizeof(GinMetaPageData) );
+		MarkBufferDirty(buffer);
+	}
+
+	/*
+	 * Make real write
+	 */
+
+	MarkBufferDirty(metabuffer);
+	if ( !index->rd_istemp )
+	{
+		XLogRecPtr  recptr;
+
+		recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE, rdata);
+		PageSetLSN(metapage, recptr);
+		PageSetTLI(metapage, ThisTimeLineID);
+
+		if ( buffer != InvalidBuffer )
+		{
+			PageSetLSN(page, recptr);
+			PageSetTLI(page, ThisTimeLineID);
+		}
+	}
+
+	if (buffer != InvalidBuffer)
+		UnlockReleaseBuffer(buffer);
+
+	/*
+	 * Force pending list cleanup when it becomes too long.
+	 * And, ginInsertCleanup could take significant amount of
+	 * time, so we prefer to call it when it can do all the work in a
+	 * single collection cycle. In non-vacuum mode, it shouldn't
+	 * require maintenance_work_mem, so fire it while pending list is
+	 * still small enough to fit into work_mem.
+	 *
+	 * ginInsertCleanup() should not be called inside our CRIT_SECTION.
+	 */
+	if ( metadata->nPendingPages * GIN_PAGE_FREESIZE > work_mem * 1024L )
+		needCleanup = true;
+
+	UnlockReleaseBuffer(metabuffer);
+
+	END_CRIT_SECTION();
+
+	if ( needCleanup )
+		ginInsertCleanup(index, ginstate, false, NULL);
+}
+
+/*
+ * Collect values from one tuple to be indexed. All values for
+ * one tuple should be written at once - to guarantee consistent state
+ *
+ * The entries extracted from value are turned into index tuples that carry
+ * the heap pointer *item and are appended to collector->tuples, growing
+ * that array as needed.  Returns the number of entries collected, which is
+ * 0 when the value yields no entries.
+ */
+uint32
+ginHeapTupleFastCollect(Relation index, GinState *ginstate,
+						GinTupleCollector *collector,
+						OffsetNumber attnum, Datum value, ItemPointer item)
+{
+	Datum	   *entries;
+	int32		i,
+				nentries;
+
+	entries = extractEntriesSU(ginstate, attnum, value, &nentries);
+
+	if (nentries == 0)
+		/* nothing to insert */
+		return 0;
+
+	/*
+	 * Allocate/reallocate memory for storing collected tuples
+	 */
+	if ( collector->tuples == NULL )
+	{
+		collector->lentuples = nentries * index->rd_att->natts;
+		collector->tuples = (IndexTuple*)palloc(sizeof(IndexTuple) * collector->lentuples);
+	}
+
+	while ( collector->ntuples + nentries > collector->lentuples )
+	{
+		collector->lentuples *= 2;
+		collector->tuples = (IndexTuple*)repalloc( collector->tuples,
+												  sizeof(IndexTuple) * collector->lentuples);
+	}
+
+	/*
+	 * Creates tuple's array
+	 */
+	for (i = 0; i < nentries; i++)
+	{
+		int32 tupsize;
+
+		collector->tuples[collector->ntuples + i] = GinFormTuple(ginstate, attnum, entries[i], NULL, 0);
+		collector->tuples[collector->ntuples + i]->t_tid = *item;
+		tupsize = IndexTupleSize(collector->tuples[collector->ntuples + i]);
+
+		if ( tupsize > TOAST_INDEX_TARGET || tupsize >= GinMaxItemSize)
+			elog(ERROR, "huge tuple");
+
+		collector->sumsize += tupsize;
+	}
+
+	collector->ntuples += nentries;
+
+	return nentries;
+}
+
+/*
+ * Deletes pending list pages up to (not including) newHead page.
+ * If newHead == InvalidBlockNumber then function drops the whole list.
+ *
+ * metapage is pinned and exclusive-locked throughout this function.
+ *
+ * Returns true if another cleanup process is running concurrently
+ * (if so, we can just abandon our own efforts)
+ */
+static bool
+shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
+		  IndexBulkDeleteResult *stats)
+{
+	Page				metapage;
+	GinMetaPageData	   *metadata;
+	BlockNumber			blknoToDelete;
+
+	metapage = BufferGetPage(metabuffer);
+	metadata = GinPageGetMeta(metapage);
+	blknoToDelete = metadata->head;
+
+	do
+	{
+		Page		page;
+		int			i;
+		int64		nDeletedHeapTuples = 0;
+		ginxlogDeleteListPages	data;
+		XLogRecData	rdata[1];
+		Buffer		buffers[GIN_NDELETE_AT_ONCE];
+
+		data.node = index->rd_node;
+
+		rdata[0].buffer = InvalidBuffer;
+		rdata[0].data = (char *) &data;
+		rdata[0].len = sizeof(ginxlogDeleteListPages);
+		rdata[0].next = NULL;
+
+		data.ndeleted = 0;
+		while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead)
+		{
+			data.toDelete[ data.ndeleted ] = blknoToDelete;
+			buffers[ data.ndeleted ] = ReadBuffer(index, blknoToDelete);
+			LockBuffer( buffers[ data.ndeleted ], GIN_EXCLUSIVE );
+			page = BufferGetPage( buffers[ data.ndeleted ] );
+
+			data.ndeleted++;
+
+			if ( GinPageIsDeleted(page) )
+			{
+				/*
+				 * concurrent cleanup process is detected; release every
+				 * buffer locked in this batch (including this one) and quit
+				 */
+				for(i=0;i<data.ndeleted;i++)
+					UnlockReleaseBuffer( buffers[i] );
+
+				return true;
+			}
+
+			nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff;
+			blknoToDelete = GinPageGetOpaque( page )->rightlink;
+		}
+
+		if (stats)
+			stats->pages_deleted += data.ndeleted;
+
+		START_CRIT_SECTION();
+
+		metadata->head = blknoToDelete;
+
+		Assert( metadata->nPendingPages >= data.ndeleted );
+		metadata->nPendingPages -= data.ndeleted;
+		Assert( metadata->nPendingHeapTuples >= nDeletedHeapTuples );
+		metadata->nPendingHeapTuples -= nDeletedHeapTuples;
+
+		if ( blknoToDelete == InvalidBlockNumber )
+		{
+			metadata->tail = InvalidBlockNumber;
+			metadata->tailFreeSize = 0;
+			metadata->nPendingPages = 0;
+			metadata->nPendingHeapTuples = 0;
+		}
+		memcpy( &data.metadata, metadata, sizeof(GinMetaPageData));
+
+		MarkBufferDirty( metabuffer );
+
+		for(i=0; i<data.ndeleted; i++)
+		{
+			page = BufferGetPage( buffers[ i ] );
+			GinPageGetOpaque( page )->flags = GIN_DELETED;
+			MarkBufferDirty( buffers[ i ] );
+		}
+
+		if ( !index->rd_istemp )
+		{
+			XLogRecPtr  recptr;
+
+			recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE, rdata);
+			PageSetLSN(metapage, recptr);
+			PageSetTLI(metapage, ThisTimeLineID);
+
+			for(i=0; i<data.ndeleted; i++)
+			{
+				page = BufferGetPage( buffers[ i ] );
+				PageSetLSN(page, recptr);
+				PageSetTLI(page, ThisTimeLineID);
+			}
+		}
+
+		for(i=0; i<data.ndeleted; i++)
+			UnlockReleaseBuffer( buffers[ i ] );
+
+		END_CRIT_SECTION();
+	} while( blknoToDelete != newHead );
+
+	return false;
+}
+
+/* Add datum to DatumArray, resizing (doubling maxvalues) if needed */
+static void
+addDatum(DatumArray *datums, Datum datum)
+{
+	if ( datums->nvalues >= datums->maxvalues)
+	{
+		datums->maxvalues *= 2;
+		datums->values = (Datum*)repalloc(datums->values,
+										  sizeof(Datum)*datums->maxvalues);
+	}
+
+	datums->values[ datums->nvalues++ ] = datum;
+}
+
+/*
+ * Go through all tuples >= startoff on page and collect values in memory
+ *
+ * Consecutive tuples that share the same heap pointer and attribute number
+ * are gathered in da and handed to ginInsertRecordBA() in a single call.
+ *
+ * Note that da is just workspace --- it does not carry any state across
+ * calls.
+ */
+static void
+processPendingPage(BuildAccumulator *accum, DatumArray *da,
+				   Page page, OffsetNumber startoff)
+{
+	ItemPointerData	heapptr;
+	OffsetNumber	i,maxoff;
+	OffsetNumber	attrnum, curattnum;
+
+	/* reset *da to empty */
+	da->nvalues = 0;
+
+	maxoff = PageGetMaxOffsetNumber(page);
+	Assert( maxoff >= FirstOffsetNumber );
+	ItemPointerSetInvalid(&heapptr);
+	attrnum = 0;
+
+	for (i = startoff; i <= maxoff; i = OffsetNumberNext(i))
+	{
+		IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+
+		curattnum = gintuple_get_attrnum(accum->ginstate, itup);
+
+		if ( !ItemPointerIsValid(&heapptr) )
+		{
+			heapptr = itup->t_tid;
+			attrnum = curattnum;
+		}
+		else if ( !(ItemPointerEquals(&heapptr, &itup->t_tid) &&
+					curattnum == attrnum) )
+		{
+			/*
+			 * We can insert several datums per call, but only for one heap
+			 * tuple and one column.
+			 */
+			ginInsertRecordBA(accum, &heapptr, attrnum, da->values, da->nvalues);
+			da->nvalues = 0;
+			heapptr = itup->t_tid;
+			attrnum = curattnum;
+		}
+		addDatum(da, gin_index_getattr(accum->ginstate, itup));
+	}
+
+	ginInsertRecordBA(accum, &heapptr, attrnum, da->values, da->nvalues);
+}
+
+/*
+ * Move tuples from pending pages into regular GIN structure.
+ *
+ * This can be called concurrently by multiple backends, so it must cope.
+ * On first glance it looks completely not concurrent-safe and not crash-safe
+ * either. The reason it's okay is that multiple insertion of the same entry
+ * is detected and treated as a no-op by gininsert.c. If we crash after
+ * posting entries to the main index and before removing them from the
+ * pending list, it's okay because when we redo the posting later on, nothing
+ * bad will happen. Likewise, if two backends simultaneously try to post
+ * a pending entry into the main index, one will succeed and one will do
+ * nothing. We try to notice when someone else is a little bit ahead of
+ * us in the process, but that's just to avoid wasting cycles.
Only the
+ * action of removing a page from the pending list really needs exclusive
+ * lock.
+ *
+ * vac_delay indicates that ginInsertCleanup is called from vacuum process,
+ * so call vacuum_delay_point() periodically.
+ * If stats isn't null, we count deleted pending pages into the counts.
+ */
+void
+ginInsertCleanup(Relation index, GinState *ginstate,
+				 bool vac_delay, IndexBulkDeleteResult *stats)
+{
+	Buffer			metabuffer, buffer;
+	Page			metapage, page;
+	GinMetaPageData	*metadata;
+	MemoryContext	opCtx, oldCtx;
+	BuildAccumulator accum;
+	DatumArray		datums;
+	BlockNumber		blkno;
+
+	metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
+	LockBuffer(metabuffer, GIN_SHARE);
+	metapage = BufferGetPage(metabuffer);
+	metadata = GinPageGetMeta(metapage);
+
+	if ( metadata->head == InvalidBlockNumber )
+	{
+		/* Nothing to do */
+		UnlockReleaseBuffer(metabuffer);
+		return;
+	}
+
+	/*
+	 * Read and lock head of pending list
+	 */
+	blkno = metadata->head;
+	buffer = ReadBuffer(index, blkno);
+	LockBuffer(buffer, GIN_SHARE);
+	page = BufferGetPage(buffer);
+
+	LockBuffer(metabuffer, GIN_UNLOCK);
+
+	/*
+	 * Initialize. All temporary space will be in opCtx
+	 */
+	opCtx = AllocSetContextCreate(CurrentMemoryContext,
+								  "GIN insert cleanup temporary context",
+								  ALLOCSET_DEFAULT_MINSIZE,
+								  ALLOCSET_DEFAULT_INITSIZE,
+								  ALLOCSET_DEFAULT_MAXSIZE);
+
+	oldCtx = MemoryContextSwitchTo(opCtx);
+
+	datums.maxvalues=128;
+	datums.nvalues = 0;
+	datums.values = (Datum*)palloc(sizeof(Datum)*datums.maxvalues);
+
+	ginInitBA(&accum);
+	accum.ginstate = ginstate;
+
+	/*
+	 * At the top of this loop, we have pin and lock on the current page
+	 * of the pending list. However, we'll release that before exiting
+	 * the loop. Note we also have pin but not lock on the metapage.
+	 */
+	for(;;)
+	{
+		if ( GinPageIsDeleted(page) )
+		{
+			/* another cleanup process is running concurrently */
+			UnlockReleaseBuffer( buffer );
+			break;
+		}
+
+		/*
+		 * read page's datums into memory
+		 */
+		processPendingPage(&accum, &datums, page, FirstOffsetNumber);
+
+		if (vac_delay)
+			vacuum_delay_point();
+
+		/*
+		 * Is it time to flush memory to disk? Flush if we are at the end
+		 * of the pending list, or if we have a full row and memory is
+		 * getting full.
+		 *
+		 * XXX using up maintenance_work_mem here is probably unreasonably
+		 * much, since vacuum might already be using that much.
+		 */
+		if ( GinPageGetOpaque(page)->rightlink == InvalidBlockNumber ||
+			 ( GinPageHasFullRow(page) &&
+			   accum.allocatedMemory > maintenance_work_mem * 1024L ) )
+		{
+			ItemPointerData    *list;
+			uint32      		nlist;
+			Datum       		entry;
+			OffsetNumber  		maxoff, attnum;
+
+			/*
+			 * Unlock current page to increase performance.
+			 * Changes of page will be checked later by comparing
+			 * maxoff after completion of memory flush.
+			 */
+			maxoff = PageGetMaxOffsetNumber(page);
+			LockBuffer(buffer, GIN_UNLOCK);
+
+			/*
+			 * Moving collected data into regular structure can take
+			 * significant amount of time - so, run it without locking pending
+			 * list.
+			 */
+			while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL)
+			{
+				ginEntryInsert(index, ginstate, attnum, entry, list, nlist, FALSE);
+				if (vac_delay)
+					vacuum_delay_point();
+			}
+
+			/*
+			 * Lock the whole list to remove pages
+			 */
+			LockBuffer(metabuffer, GIN_EXCLUSIVE);
+			LockBuffer(buffer, GIN_SHARE);
+
+			if ( GinPageIsDeleted(page) )
+			{
+				/* another cleanup process is running concurrently */
+				UnlockReleaseBuffer(buffer);
+				LockBuffer(metabuffer, GIN_UNLOCK);
+				break;
+			}
+
+			/*
+			 * While we left the page unlocked, more stuff might have gotten
+			 * added to it. If so, process those entries immediately. There
+			 * shouldn't be very many, so we don't worry about the fact that
+			 * we're doing this with exclusive lock. Insertion algorithm
+			 * guarantees that inserted row(s) will not continue on next page.
+			 * NOTE: intentionally no vacuum_delay_point in this loop.
+			 */
+			if ( PageGetMaxOffsetNumber(page) != maxoff )
+			{
+				ginInitBA(&accum);
+				processPendingPage(&accum, &datums, page, maxoff+1);
+
+				while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL)
+					ginEntryInsert(index, ginstate, attnum, entry, list, nlist, FALSE);
+			}
+
+			/*
+			 * Remember next page - it will become the new list head
+			 */
+			blkno = GinPageGetOpaque(page)->rightlink;
+			UnlockReleaseBuffer(buffer); /* shiftList will do exclusive locking */
+
+			/*
+			 * remove the pages we have read from the pending list; at this
+			 * point all content of those pages is in the regular structure
+			 */
+			if ( shiftList(index, metabuffer, blkno, stats) )
+			{
+				/* another cleanup process is running concurrently */
+				LockBuffer(metabuffer, GIN_UNLOCK);
+				break;
+			}
+
+			Assert( blkno == metadata->head );
+			LockBuffer(metabuffer, GIN_UNLOCK);
+
+			/*
+			 * if we removed the whole pending list just exit
+			 */
+			if ( blkno == InvalidBlockNumber )
+				break;
+
+			/*
+			 * release memory used so far and reinit state
+			 */
+			MemoryContextReset(opCtx);
+			ginInitBA(&accum);
+			datums.nvalues = 0;
+			datums.values = (Datum*)palloc(sizeof(Datum)*datums.maxvalues);
+		}
+		else
+		{
+			blkno = GinPageGetOpaque(page)->rightlink;
+			UnlockReleaseBuffer(buffer);
+		}
+
+		/*
+		 * Read next page in pending list
+		 */
+		CHECK_FOR_INTERRUPTS();
+		buffer = ReadBuffer(index, blkno);
+		LockBuffer(buffer, GIN_SHARE);
+		page = BufferGetPage(buffer);
+	}
+
+	ReleaseBuffer(metabuffer);
+
+	/* Clean up temporary space */
+	MemoryContextSwitchTo(oldCtx);
+	MemoryContextDelete(opCtx);
+}
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 182981498c1..7f9f1236605 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- * 
$PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.22 2009/01/10 21:08:36 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.23 2009/03/24 20:17:10 tgl Exp $ *------------------------------------------------------------------------- */ @@ -23,6 +23,15 @@ #include "utils/memutils.h" +typedef struct pendingPosition +{ + Buffer pendingBuffer; + OffsetNumber firstOffset; + OffsetNumber lastOffset; + ItemPointerData item; +} pendingPosition; + + /* * Tries to refind previously taken ItemPointer on page. */ @@ -258,7 +267,7 @@ computePartialMatchList( GinBtreeData *btree, GinBtreeStack *stack, GinScanEntry } /* - * Start* functions setup begining state of searches: finds correct buffer and pins it. + * Start* functions setup beginning state of searches: finds correct buffer and pins it. */ static void startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) @@ -268,6 +277,15 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) Page page; bool needUnlock = TRUE; + entry->buffer = InvalidBuffer; + entry->offset = InvalidOffsetNumber; + entry->list = NULL; + entry->nlist = 0; + entry->partialMatch = NULL; + entry->partialMatchResult = NULL; + entry->reduceResult = FALSE; + entry->predictNumberResult = 0; + if (entry->master != NULL) { entry->isFinished = entry->master->isFinished; @@ -285,15 +303,6 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) page = BufferGetPage(stackEntry->buffer); entry->isFinished = TRUE; - entry->buffer = InvalidBuffer; - entry->offset = InvalidOffsetNumber; - entry->list = NULL; - entry->nlist = 0; - entry->partialMatch = NULL; - entry->partialMatchIterator = NULL; - entry->partialMatchResult = NULL; - entry->reduceResult = FALSE; - entry->predictNumberResult = 0; if ( entry->isPartialMatch ) { @@ -354,9 +363,10 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) entry->buffer = scanBeginPostingTree(gdi); /* - * We keep buffer pinned because we 
need to prevent deletition + * We keep buffer pinned because we need to prevent deletion of * page during scan. See GIN's vacuum implementation. RefCount - * is increased to keep buffer pinned after freeGinBtreeStack() call. + * is increased to keep buffer pinned after freeGinBtreeStack() + * call. */ IncrBufferRefCount(entry->buffer); @@ -536,9 +546,10 @@ entryGetItem(Relation index, GinScanEntry entry) { do { - if ( entry->partialMatchResult == NULL || entry->offset >= entry->partialMatchResult->ntuples ) + if (entry->partialMatchResult == NULL || + entry->offset >= entry->partialMatchResult->ntuples) { - entry->partialMatchResult = tbm_iterate( entry->partialMatchIterator ); + entry->partialMatchResult = tbm_iterate(entry->partialMatchIterator); if ( entry->partialMatchResult == NULL ) { @@ -548,23 +559,37 @@ entryGetItem(Relation index, GinScanEntry entry) entry->isFinished = TRUE; break; } - else if ( entry->partialMatchResult->ntuples < 0 ) - { - /* bitmap became lossy */ - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("not enough memory to store result of partial match operator" ), - errhint("Increase the \"work_mem\" parameter."))); - } + + /* + * reset counter to the beginning of entry->partialMatchResult. + * Note: entry->offset is still greater than + * partialMatchResult->ntuples if partialMatchResult is + * lossy. So, on next call we will get next result from + * TIDBitmap. 
+ */ entry->offset = 0; } - ItemPointerSet(&entry->curItem, - entry->partialMatchResult->blockno, - entry->partialMatchResult->offsets[ entry->offset ]); - entry->offset ++; + if ( entry->partialMatchResult->ntuples < 0 ) + { + /* + * lossy result, so we need to check the whole page + */ + ItemPointerSetLossyPage(&entry->curItem, + entry->partialMatchResult->blockno); + /* + * We might as well fall out of the loop; we could not + * estimate number of results on this page to support correct + * reducing of result even if it's enabled + */ + break; + } - } while (entry->isFinished == FALSE && entry->reduceResult == TRUE && dropItem(entry)); + ItemPointerSet(&entry->curItem, + entry->partialMatchResult->blockno, + entry->partialMatchResult->offsets[entry->offset]); + entry->offset++; + } while (entry->reduceResult == TRUE && dropItem(entry)); } else if (!BufferIsValid(entry->buffer)) { @@ -618,6 +643,10 @@ keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx, if (key->entryRes[i]) { + /* + * Move forward only entries which was the least + * on previous call + */ if (entry->isFinished == FALSE && entryGetItem(index, entry) == FALSE) { if (compareItemPointers(&entry->curItem, &key->curItem) < 0) @@ -664,6 +693,13 @@ keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx, */ *keyrecheck = true; + /* + * If one of the entry's scans returns lossy result, return it without + * checking - we can't suggest anything helpful to consistentFn. + */ + if (ItemPointerIsLossyPage(&key->curItem)) + return FALSE; + oldCtx = MemoryContextSwitchTo(tempCtx); res = DatumGetBool(FunctionCall4(&ginstate->consistentFn[key->attnum-1], PointerGetDatum(key->entryRes), @@ -677,6 +713,337 @@ keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx, return FALSE; } + +/* + * Get ItemPointer of next heap row to be checked from pending list. + * Returns false if there are no more. 
+ * + * The pendingBuffer is presumed pinned and share-locked on entry, and is + * pinned and share-locked on success exit. On failure exit it's released. + */ +static bool +scanGetCandidate(IndexScanDesc scan, pendingPosition *pos) +{ + OffsetNumber maxoff; + Page page; + IndexTuple itup; + + ItemPointerSetInvalid( &pos->item ); + for(;;) + { + page = BufferGetPage(pos->pendingBuffer); + + maxoff = PageGetMaxOffsetNumber(page); + if ( pos->firstOffset > maxoff ) + { + BlockNumber blkno = GinPageGetOpaque(page)->rightlink; + if ( blkno == InvalidBlockNumber ) + { + UnlockReleaseBuffer(pos->pendingBuffer); + pos->pendingBuffer=InvalidBuffer; + + return false; + } + else + { + /* + * Here we must prevent deletion of next page by + * insertcleanup process, which may be trying to obtain + * exclusive lock on current page. So, we lock next + * page before releasing the current one + */ + Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno); + + LockBuffer(tmpbuf, GIN_SHARE); + UnlockReleaseBuffer(pos->pendingBuffer); + + pos->pendingBuffer = tmpbuf; + pos->firstOffset = FirstOffsetNumber; + } + } + else + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset)); + pos->item = itup->t_tid; + if ( GinPageHasFullRow(page) ) + { + /* + * find itempointer to the next row + */ + for(pos->lastOffset = pos->firstOffset+1; pos->lastOffset<=maxoff; pos->lastOffset++) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset)); + if (!ItemPointerEquals(&pos->item, &itup->t_tid)) + break; + } + } + else + { + /* + * All itempointers are the same on this page + */ + pos->lastOffset = maxoff + 1; + } + break; + } + } + + return true; +} + +static bool +matchPartialInPendingList(GinState *ginstate, Page page, + OffsetNumber off, OffsetNumber maxoff, + Datum value, OffsetNumber attrnum, + Datum *datum, bool *datumExtracted, + StrategyNumber strategy) +{ + IndexTuple itup; + int res; + + while ( off < maxoff ) + { + itup = (IndexTuple) 
PageGetItem(page, PageGetItemId(page, off)); + if ( attrnum != gintuple_get_attrnum(ginstate, itup) ) + return false; + + if (datumExtracted[ off-1 ] == false) + { + datum[ off-1 ] = gin_index_getattr(ginstate, itup); + datumExtracted[ off-1 ] = true; + } + + res = DatumGetInt32(FunctionCall3(&ginstate->comparePartialFn[attrnum], + value, + datum[ off-1 ], + UInt16GetDatum(strategy))); + if ( res == 0 ) + return true; + else if (res>0) + return false; + } + + return false; +} + +/* + * Sets entryRes array for each key by looking at + * every entry per indexed value (row) in pending list. + * returns true if at least one of datum was matched by key's entry + * + * The pendingBuffer is presumed pinned and share-locked on entry. + */ +static bool +collectDatumForItem(IndexScanDesc scan, pendingPosition *pos) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + OffsetNumber attrnum; + Page page; + IndexTuple itup; + int i, j; + bool hasMatch = false; + + /* + * Resets entryRes + */ + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + memset( key->entryRes, FALSE, key->nentries ); + } + + for(;;) + { + Datum datum[ BLCKSZ/sizeof(IndexTupleData) ]; + bool datumExtracted[ BLCKSZ/sizeof(IndexTupleData) ]; + + Assert( pos->lastOffset > pos->firstOffset ); + memset(datumExtracted + pos->firstOffset - 1, 0, sizeof(bool) * (pos->lastOffset - pos->firstOffset )); + + page = BufferGetPage(pos->pendingBuffer); + + for(i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + for(j=0; j<key->nentries; j++) + { + OffsetNumber StopLow = pos->firstOffset, + StopHigh = pos->lastOffset, + StopMiddle; + GinScanEntry entry = key->scanEntry + j; + + if ( key->entryRes[j] ) + continue; + + while (StopLow < StopHigh) + { + StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle)); + attrnum = gintuple_get_attrnum(&so->ginstate, itup); + + if (key->attnum < attrnum) + StopHigh = StopMiddle; 
+ else if (key->attnum > attrnum) + StopLow = StopMiddle + 1; + else + { + int res; + + if (datumExtracted[ StopMiddle-1 ] == false) + { + datum[ StopMiddle-1 ] = gin_index_getattr(&so->ginstate, itup); + datumExtracted[ StopMiddle-1 ] = true; + } + res = compareEntries(&so->ginstate, + entry->attnum, + entry->entry, + datum[ StopMiddle-1 ]); + + if ( res == 0 ) + { + if ( entry->isPartialMatch ) + key->entryRes[j] = + matchPartialInPendingList(&so->ginstate, + page, StopMiddle, + pos->lastOffset, + entry->entry, + entry->attnum, + datum, + datumExtracted, + entry->strategy); + else + key->entryRes[j] = true; + break; + } + else if ( res < 0 ) + StopHigh = StopMiddle; + else + StopLow = StopMiddle + 1; + } + } + + if ( StopLow>=StopHigh && entry->isPartialMatch ) + key->entryRes[j] = + matchPartialInPendingList(&so->ginstate, + page, StopHigh, + pos->lastOffset, + entry->entry, + entry->attnum, + datum, + datumExtracted, + entry->strategy); + + hasMatch |= key->entryRes[j]; + } + } + + pos->firstOffset = pos->lastOffset; + + if ( GinPageHasFullRow(page) ) + { + /* + * We scan all values from one tuple, go to next one + */ + + return hasMatch; + } + else + { + ItemPointerData item = pos->item; + + if ( scanGetCandidate(scan, pos) == false || !ItemPointerEquals(&pos->item, &item) ) + elog(ERROR,"Could not process tuple"); /* XXX should not be here ! */ + } + } + + return hasMatch; +} + +/* + * Collect all matched rows from pending list in bitmap + */ +static void +scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + MemoryContext oldCtx; + bool recheck, keyrecheck, match; + int i; + pendingPosition pos; + Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO); + BlockNumber blkno; + + *ntids = 0; + + LockBuffer(metabuffer, GIN_SHARE); + blkno = GinPageGetMeta(BufferGetPage(metabuffer))->head; + + /* + * fetch head of list before unlocking metapage. 
+ * head page must be pinned to prevent deletion by vacuum process + */ + if ( blkno == InvalidBlockNumber ) + { + /* No pending list, so proceed with normal scan */ + UnlockReleaseBuffer( metabuffer ); + return; + } + + pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno); + LockBuffer(pos.pendingBuffer, GIN_SHARE); + pos.firstOffset = FirstOffsetNumber; + UnlockReleaseBuffer( metabuffer ); + + /* + * loop for each heap row + */ + while( scanGetCandidate(scan, &pos) ) + { + + /* + * Check entries in rows and setup entryRes array + */ + if (!collectDatumForItem(scan, &pos)) + continue; + + /* + * check for consistent + */ + oldCtx = MemoryContextSwitchTo(so->tempCtx); + recheck = false; + match = true; + + for (i = 0; match && i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + keyrecheck = true; + + if ( DatumGetBool(FunctionCall4(&so->ginstate.consistentFn[ key->attnum-1 ], + PointerGetDatum(key->entryRes), + UInt16GetDatum(key->strategy), + key->query, + PointerGetDatum(&keyrecheck))) == false ) + { + match = false; + } + + recheck |= keyrecheck; + } + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(so->tempCtx); + + if ( match ) + { + tbm_add_tuples(tbm, &pos.item, 1, recheck); + (*ntids)++; + } + } +} + /* * Get heap item pointer from scan * returns true if found @@ -720,6 +1087,18 @@ scanGetItem(IndexScanDesc scan, ItemPointerData *item, bool *recheck) { int cmp = compareItemPointers(item, &key->curItem); + if ( cmp != 0 && (ItemPointerIsLossyPage(item) || ItemPointerIsLossyPage(&key->curItem)) ) + { + /* + * if one of ItemPointers points to the whole page then + * compare only page's number + */ + if ( ItemPointerGetBlockNumber(item) == ItemPointerGetBlockNumber(&key->curItem) ) + cmp = 0; + else + cmp = (ItemPointerGetBlockNumber(item) > ItemPointerGetBlockNumber(&key->curItem)) ? 
1 : -1; + } + if (cmp == 0) break; else if (cmp > 0) @@ -757,9 +1136,26 @@ gingetbitmap(PG_FUNCTION_ARGS) if (GinIsVoidRes(scan)) PG_RETURN_INT64(0); + ntids = 0; + + /* + * First, scan the pending list and collect any matching entries into + * the bitmap. After we scan a pending item, some other backend could + * post it into the main index, and so we might visit it a second time + * during the main scan. This is okay because we'll just re-set the + * same bit in the bitmap. (The possibility of duplicate visits is a + * major reason why GIN can't support the amgettuple API, however.) + * Note that it would not do to scan the main index before the pending + * list, since concurrent cleanup could then make us miss entries + * entirely. + */ + scanPendingInsert(scan, tbm, &ntids); + + /* + * Now scan the main index. + */ startScan(scan); - ntids = 0; for (;;) { ItemPointerData iptr; @@ -770,31 +1166,12 @@ gingetbitmap(PG_FUNCTION_ARGS) if (!scanGetItem(scan, &iptr, &recheck)) break; - tbm_add_tuples(tbm, &iptr, 1, recheck); + if ( ItemPointerIsLossyPage(&iptr) ) + tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr)); + else + tbm_add_tuples(tbm, &iptr, 1, recheck); ntids++; } PG_RETURN_INT64(ntids); } - -Datum -gingettuple(PG_FUNCTION_ARGS) -{ - IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); - ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1); - bool res; - - if (dir != ForwardScanDirection) - elog(ERROR, "GIN doesn't support other scan directions than forward"); - - if (GinIsNewKey(scan)) - newScanKey(scan); - - if (GinIsVoidRes(scan)) - PG_RETURN_BOOL(false); - - startScan(scan); - res = scanGetItem(scan, &scan->xs_ctup.t_self, &scan->xs_recheck); - - PG_RETURN_BOOL(res); -} diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 2ab1105423c..d05882cdb94 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University 
of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/gininsert.c,v 1.18 2009/01/01 17:23:34 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/gininsert.c,v 1.19 2009/03/24 20:17:11 tgl Exp $ *------------------------------------------------------------------------- */ @@ -138,9 +138,11 @@ addItemPointersToTuple(Relation index, GinState *ginstate, GinBtreeStack *stack, /* * Inserts only one entry to the index, but it can add more than 1 ItemPointer. */ -static void -ginEntryInsert(Relation index, GinState *ginstate, OffsetNumber attnum, Datum value, - ItemPointerData *items, uint32 nitem, bool isBuild) +void +ginEntryInsert(Relation index, GinState *ginstate, + OffsetNumber attnum, Datum value, + ItemPointerData *items, uint32 nitem, + bool isBuild) { GinBtreeData btree; GinBtreeStack *stack; @@ -273,7 +275,7 @@ ginbuild(PG_FUNCTION_ARGS) IndexBuildResult *result; double reltuples; GinBuildState buildstate; - Buffer buffer; + Buffer RootBuffer, MetaBuffer; ItemPointerData *list; Datum entry; uint32 nlist; @@ -286,11 +288,17 @@ ginbuild(PG_FUNCTION_ARGS) initGinState(&buildstate.ginstate, index); + /* initialize the meta page */ + MetaBuffer = GinNewBuffer(index); + /* initialize the root page */ - buffer = GinNewBuffer(index); + RootBuffer = GinNewBuffer(index); + START_CRIT_SECTION(); - GinInitBuffer(buffer, GIN_LEAF); - MarkBufferDirty(buffer); + GinInitMetabuffer(MetaBuffer); + MarkBufferDirty(MetaBuffer); + GinInitBuffer(RootBuffer, GIN_LEAF); + MarkBufferDirty(RootBuffer); if (!index->rd_istemp) { @@ -303,16 +311,19 @@ ginbuild(PG_FUNCTION_ARGS) rdata.len = sizeof(RelFileNode); rdata.next = NULL; - page = BufferGetPage(buffer); - - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX, &rdata); + + page = BufferGetPage(RootBuffer); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); + page = BufferGetPage(MetaBuffer); + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); } - UnlockReleaseBuffer(buffer); + 
UnlockReleaseBuffer(MetaBuffer); + UnlockReleaseBuffer(RootBuffer); END_CRIT_SECTION(); /* build the index */ @@ -417,9 +428,26 @@ gininsert(PG_FUNCTION_ARGS) initGinState(&ginstate, index); - for(i=0; i<ginstate.origTupdesc->natts;i++) - if ( !isnull[i] ) - res += ginHeapTupleInsert(index, &ginstate, (OffsetNumber)(i+1), values[i], ht_ctid); + if ( GinGetUseFastUpdate(index) ) + { + GinTupleCollector collector; + + memset(&collector, 0, sizeof(GinTupleCollector)); + for(i=0; i<ginstate.origTupdesc->natts;i++) + if ( !isnull[i] ) + res += ginHeapTupleFastCollect(index, &ginstate, &collector, + (OffsetNumber)(i+1), values[i], ht_ctid); + + ginHeapTupleFastInsert(index, &ginstate, &collector); + } + else + { + for(i=0; i<ginstate.origTupdesc->natts;i++) + if ( !isnull[i] ) + res += ginHeapTupleInsert(index, &ginstate, + (OffsetNumber)(i+1), values[i], ht_ctid); + + } MemoryContextSwitchTo(oldCtx); MemoryContextDelete(insertCtx); diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 222ea677883..e0951a6a4f8 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginutil.c,v 1.20 2009/01/05 17:14:28 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginutil.c,v 1.21 2009/03/24 20:17:11 tgl Exp $ *------------------------------------------------------------------------- */ @@ -57,7 +57,7 @@ initGinState(GinState *state, Relation index) CurrentMemoryContext); /* - * Check opclass capability to do partial match. + * Check opclass capability to do partial match. 
*/ if ( index_getprocid(index, i+1, GIN_COMPARE_PARTIAL_PROC) != InvalidOid ) { @@ -88,7 +88,7 @@ gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple) bool isnull; /* - * First attribute is always int16, so we can safely use any + * First attribute is always int16, so we can safely use any * tuple descriptor to obtain first attribute of tuple */ res = index_getattr(tuple, FirstOffsetNumber, ginstate->tupdesc[0], @@ -213,6 +213,22 @@ GinInitBuffer(Buffer b, uint32 f) GinInitPage(BufferGetPage(b), f, BufferGetPageSize(b)); } +void +GinInitMetabuffer(Buffer b) +{ + GinMetaPageData *metadata; + Page page = BufferGetPage(b); + + GinInitPage(page, GIN_META, BufferGetPageSize(b)); + + metadata = GinPageGetMeta(page); + + metadata->head = metadata->tail = InvalidBlockNumber; + metadata->tailFreeSize = 0; + metadata->nPendingPages = 0; + metadata->nPendingHeapTuples = 0; +} + int compareEntries(GinState *ginstate, OffsetNumber attnum, Datum a, Datum b) { @@ -315,10 +331,26 @@ ginoptions(PG_FUNCTION_ARGS) { Datum reloptions = PG_GETARG_DATUM(0); bool validate = PG_GETARG_BOOL(1); - bytea *result; + relopt_value *options; + GinOptions *rdopts; + int numoptions; + static const relopt_parse_elt tab[] = { + {"fastupdate", RELOPT_TYPE_BOOL, offsetof(GinOptions, useFastUpdate)} + }; + + options = parseRelOptions(reloptions, validate, RELOPT_KIND_GIN, + &numoptions); + + /* if none set, we're done */ + if (numoptions == 0) + PG_RETURN_NULL(); + + rdopts = allocateReloptStruct(sizeof(GinOptions), options, numoptions); + + fillRelOptions((void *) rdopts, sizeof(GinOptions), options, numoptions, + validate, tab, lengthof(tab)); + + pfree(options); - result = default_reloptions(reloptions, validate, RELOPT_KIND_GIN); - if (result) - PG_RETURN_BYTEA_P(result); - PG_RETURN_NULL(); + PG_RETURN_BYTEA_P(rdopts); } diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index be614a3c9c8..dd98b9fd284 100644 --- a/src/backend/access/gin/ginvacuum.c +++ 
b/src/backend/access/gin/ginvacuum.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.27 2009/01/01 17:23:34 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.28 2009/03/24 20:17:11 tgl Exp $ *------------------------------------------------------------------------- */ @@ -19,8 +19,8 @@ #include "catalog/storage.h" #include "commands/vacuum.h" #include "miscadmin.h" +#include "postmaster/autovacuum.h" #include "storage/bufmgr.h" -#include "storage/freespace.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" @@ -593,18 +593,24 @@ ginbulkdelete(PG_FUNCTION_ARGS) BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))]; uint32 nRoot; + gvs.index = index; + gvs.callback = callback; + gvs.callback_state = callback_state; + gvs.strategy = info->strategy; + initGinState(&gvs.ginstate, index); + /* first time through? */ if (stats == NULL) + { + /* Yes, so initialize stats to zeroes */ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + /* and cleanup any pending inserts */ + ginInsertCleanup(index, &gvs.ginstate, true, stats); + } + /* we'll re-count the tuples each time */ stats->num_index_tuples = 0; - - gvs.index = index; gvs.result = stats; - gvs.callback = callback; - gvs.callback_state = callback_state; - gvs.strategy = info->strategy; - initGinState(&gvs.ginstate, index); buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); @@ -702,10 +708,32 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) BlockNumber totFreePages; BlockNumber lastBlock = GIN_ROOT_BLKNO, lastFilledBlock = GIN_ROOT_BLKNO; + GinState ginstate; - /* Set up all-zero stats if ginbulkdelete wasn't called */ + /* + * In an autovacuum analyze, we want to clean up pending insertions. + * Otherwise, an ANALYZE-only call is a no-op. 
+ */ + if (info->analyze_only) + { + if (IsAutoVacuumWorkerProcess()) + { + initGinState(&ginstate, index); + ginInsertCleanup(index, &ginstate, true, stats); + } + PG_RETURN_POINTER(stats); + } + + /* + * Set up all-zero stats and cleanup pending inserts + * if ginbulkdelete wasn't called + */ if (stats == NULL) + { stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + initGinState(&ginstate, index); + ginInsertCleanup(index, &ginstate, true, stats); + } /* * XXX we always report the heap tuple count as the number of index diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 362709de330..03cdc1129cf 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.17 2009/01/20 18:59:36 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.18 2009/03/24 20:17:11 tgl Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" @@ -71,20 +71,30 @@ static void ginRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) { RelFileNode *node = (RelFileNode *) XLogRecGetData(record); - Buffer buffer; + Buffer RootBuffer, MetaBuffer; Page page; - buffer = XLogReadBuffer(*node, GIN_ROOT_BLKNO, true); - Assert(BufferIsValid(buffer)); - page = (Page) BufferGetPage(buffer); + MetaBuffer = XLogReadBuffer(*node, GIN_METAPAGE_BLKNO, true); + Assert(BufferIsValid(MetaBuffer)); + GinInitMetabuffer(MetaBuffer); + + page = (Page) BufferGetPage(MetaBuffer); + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); - GinInitBuffer(buffer, GIN_LEAF); + RootBuffer = XLogReadBuffer(*node, GIN_ROOT_BLKNO, true); + Assert(BufferIsValid(RootBuffer)); + page = (Page) BufferGetPage(RootBuffer); + + GinInitBuffer(RootBuffer, GIN_LEAF); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); - 
MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + MarkBufferDirty(MetaBuffer); + UnlockReleaseBuffer(MetaBuffer); + MarkBufferDirty(RootBuffer); + UnlockReleaseBuffer(RootBuffer); } static void @@ -433,6 +443,174 @@ ginRedoDeletePage(XLogRecPtr lsn, XLogRecord *record) } } +static void +ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) +{ + ginxlogUpdateMeta *data = (ginxlogUpdateMeta*) XLogRecGetData(record); + Buffer metabuffer; + Page metapage; + + metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false); + metapage = BufferGetPage(metabuffer); + + if (!XLByteLE(lsn, PageGetLSN(metapage))) + { + memcpy( GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData)); + PageSetLSN(metapage, lsn); + PageSetTLI(metapage, ThisTimeLineID); + MarkBufferDirty(metabuffer); + } + + if ( data->ntuples > 0 ) + { + /* + * insert into tail page + */ + if (!(record->xl_info & XLR_BKP_BLOCK_1)) + { + Buffer buffer = XLogReadBuffer(data->node, data->metadata.tail, false); + Page page = BufferGetPage(buffer); + + if ( !XLByteLE(lsn, PageGetLSN(page))) + { + OffsetNumber l, off = (PageIsEmpty(page)) ? 
FirstOffsetNumber : + OffsetNumberNext(PageGetMaxOffsetNumber(page)); + int i, tupsize; + IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogUpdateMeta)); + + for(i=0; i<data->ntuples; i++) + { + tupsize = IndexTupleSize(tuples); + + l = PageAddItem(page, (Item)tuples, tupsize, off, false, false); + + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page"); + + tuples = (IndexTuple)( ((char*)tuples) + tupsize ); + } + + /* + * Increase counter of heap tuples + */ + GinPageGetOpaque(page)->maxoff++; + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } + else if ( data->prevTail != InvalidBlockNumber ) + { + /* + * New tail + */ + + Buffer buffer = XLogReadBuffer(data->node, data->prevTail, false); + Page page = BufferGetPage(buffer); + + if ( !XLByteLE(lsn, PageGetLSN(page))) + { + GinPageGetOpaque(page)->rightlink = data->newRightlink; + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + + UnlockReleaseBuffer(metabuffer); +} + +static void +ginRedoInsertListPage(XLogRecPtr lsn, XLogRecord *record) +{ + ginxlogInsertListPage *data = (ginxlogInsertListPage*) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber l, off = FirstOffsetNumber; + int i, tupsize; + IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsertListPage)); + + if (record->xl_info & XLR_BKP_BLOCK_1) + return; + + buffer = XLogReadBuffer(data->node, data->blkno, true); + page = BufferGetPage(buffer); + + GinInitBuffer(buffer, GIN_LIST); + GinPageGetOpaque(page)->rightlink = data->rightlink; + if ( data->rightlink == InvalidBlockNumber ) + { + /* tail of sublist */ + GinPageSetFullRow(page); + GinPageGetOpaque(page)->maxoff = 1; + } + else + { + GinPageGetOpaque(page)->maxoff = 0; + } + + for(i=0; i<data->ntuples; i++) + { + tupsize = IndexTupleSize(tuples); + + l 
= PageAddItem(page, (Item)tuples, tupsize, off, false, false); + + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page"); + + tuples = (IndexTuple)( ((char*)tuples) + tupsize ); + } + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + + UnlockReleaseBuffer(buffer); +} + +static void +ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record) +{ + ginxlogDeleteListPages *data = (ginxlogDeleteListPages*) XLogRecGetData(record); + Buffer metabuffer; + Page metapage; + int i; + + metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false); + metapage = BufferGetPage(metabuffer); + + if (!XLByteLE(lsn, PageGetLSN(metapage))) + { + memcpy( GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData)); + PageSetLSN(metapage, lsn); + PageSetTLI(metapage, ThisTimeLineID); + MarkBufferDirty(metabuffer); + } + + for(i=0; i<data->ndeleted; i++) + { + Buffer buffer = XLogReadBuffer(data->node,data->toDelete[i],false); + Page page = BufferGetPage(buffer); + + if ( !XLByteLE(lsn, PageGetLSN(page))) + { + GinPageGetOpaque(page)->flags = GIN_DELETED; + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + + UnlockReleaseBuffer(buffer); + } + UnlockReleaseBuffer(metabuffer); +} + void gin_redo(XLogRecPtr lsn, XLogRecord *record) { @@ -461,6 +639,15 @@ gin_redo(XLogRecPtr lsn, XLogRecord *record) case XLOG_GIN_DELETE_PAGE: ginRedoDeletePage(lsn, record); break; + case XLOG_GIN_UPDATE_META_PAGE: + ginRedoUpdateMetapage(lsn, record); + break; + case XLOG_GIN_INSERT_LISTPAGE: + ginRedoInsertListPage(lsn, record); + break; + case XLOG_GIN_DELETE_LISTPAGE: + ginRedoDeleteListPages(lsn, record); + break; default: elog(PANIC, "gin_redo: unknown op code %u", info); } @@ -516,6 +703,18 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec) appendStringInfo(buf, "Delete page, "); desc_node(buf, ((ginxlogDeletePage *) rec)->node, ((ginxlogDeletePage *) rec)->blkno); break; + case 
XLOG_GIN_UPDATE_META_PAGE: + appendStringInfo(buf, "Update metapage, "); + desc_node(buf, ((ginxlogUpdateMeta *) rec)->node, ((ginxlogUpdateMeta *) rec)->metadata.tail); + break; + case XLOG_GIN_INSERT_LISTPAGE: + appendStringInfo(buf, "Insert new list page, "); + desc_node(buf, ((ginxlogInsertListPage *) rec)->node, ((ginxlogInsertListPage *) rec)->blkno); + break; + case XLOG_GIN_DELETE_LISTPAGE: + appendStringInfo(buf, "Delete list pages (%d), ", ((ginxlogDeleteListPages *) rec)->ndeleted); + desc_node(buf, ((ginxlogDeleteListPages *) rec)->node, ((ginxlogDeleteListPages *) rec)->metadata.head); + break; default: elog(PANIC, "gin_desc: unknown op code %u", info); } diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index fcf471cf2e9..01b8512d070 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.42 2009/01/01 17:23:35 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.43 2009/03/24 20:17:11 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -515,6 +515,10 @@ gistvacuumcleanup(PG_FUNCTION_ARGS) lastFilledBlock = GIST_ROOT_BLKNO; bool needLock; + /* No-op in ANALYZE ONLY mode */ + if (info->analyze_only) + PG_RETURN_POINTER(stats); + /* Set up all-zero stats if gistbulkdelete wasn't called */ if (stats == NULL) { diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index ab2f67c6385..42fe9554f0f 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.108 2009/01/01 17:23:35 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.109 2009/03/24 20:17:11 tgl Exp $ * * NOTES * This file contains 
only the public interface routines. @@ -647,6 +647,7 @@ hashvacuumcleanup(PG_FUNCTION_ARGS) BlockNumber num_pages; /* If hashbulkdelete wasn't called, return NULL signifying no change */ + /* Note: this covers the analyze_only case too */ if (stats == NULL) PG_RETURN_POINTER(NULL); diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index a03b4595ba1..197fa3b041d 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.112 2009/01/01 17:23:35 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.113 2009/03/24 20:17:12 tgl Exp $ * * INTERFACE ROUTINES * index_open - open an index relation by relation OID @@ -647,7 +647,8 @@ index_getnext(IndexScanDesc scan, ScanDirection direction) * item slot could have been replaced by a newer tuple by the time we get * to it. * - * Returns the number of matching tuples found. + * Returns the number of matching tuples found. (Note: this might be only + * approximate, so it should only be used for statistical purposes.) 
* ---------------- */ int64 diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 84f409e1aca..b8bb1ad4906 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -12,7 +12,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.167 2009/01/01 17:23:35 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.168 2009/03/24 20:17:12 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -557,6 +557,10 @@ btvacuumcleanup(PG_FUNCTION_ARGS) IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); + /* No-op in ANALYZE ONLY mode */ + if (info->analyze_only) + PG_RETURN_POINTER(stats); + /* * If btbulkdelete was called, we need not do anything, just return the * stats from the latest btbulkdelete call. 
If it wasn't called, we must diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index e53f4f52dcf..d1889e16c2c 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.312 2009/01/22 20:16:01 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.313 2009/03/24 20:17:12 tgl Exp $ * * * INTERFACE ROUTINES @@ -1938,6 +1938,7 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot) */ ivinfo.index = indexRelation; ivinfo.vacuum_full = false; + ivinfo.analyze_only = false; ivinfo.message_level = DEBUG2; ivinfo.num_heap_tuples = -1; ivinfo.strategy = NULL; diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 33447b671f1..176ebde0efd 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.133 2009/01/22 20:16:01 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.134 2009/03/24 20:17:13 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -496,6 +496,28 @@ analyze_rel(Oid relid, VacuumStmt *vacstmt, /* We skip to here if there were no analyzable columns */ cleanup: + /* If this isn't part of VACUUM ANALYZE, let index AMs do cleanup */ + if (!vacstmt->vacuum) + { + for (ind = 0; ind < nindexes; ind++) + { + IndexBulkDeleteResult *stats; + IndexVacuumInfo ivinfo; + + ivinfo.index = Irel[ind]; + ivinfo.vacuum_full = false; + ivinfo.analyze_only = true; + ivinfo.message_level = elevel; + ivinfo.num_heap_tuples = -1; /* not known for sure */ + ivinfo.strategy = vac_strategy; + + stats = index_vacuum_cleanup(&ivinfo, NULL); + + if (stats) + pfree(stats); + } + } + /* Done with indexes */ vac_close_indexes(nindexes, Irel, NoLock); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 
4020bf1b294..78b179827ea 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.385 2009/01/16 13:27:23 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.386 2009/03/24 20:17:13 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -3388,6 +3388,7 @@ scan_index(Relation indrel, double num_tuples) ivinfo.index = indrel; ivinfo.vacuum_full = true; + ivinfo.analyze_only = false; ivinfo.message_level = elevel; ivinfo.num_heap_tuples = num_tuples; ivinfo.strategy = vac_strategy; @@ -3454,6 +3455,7 @@ vacuum_index(VacPageList vacpagelist, Relation indrel, ivinfo.index = indrel; ivinfo.vacuum_full = true; + ivinfo.analyze_only = false; ivinfo.message_level = elevel; ivinfo.num_heap_tuples = num_tuples + keep_tuples; ivinfo.strategy = vac_strategy; diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 4e4624cb132..cb73cfa87a7 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -29,7 +29,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.118 2009/01/22 19:25:00 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.119 2009/03/24 20:17:14 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -875,6 +875,7 @@ lazy_vacuum_index(Relation indrel, ivinfo.index = indrel; ivinfo.vacuum_full = false; + ivinfo.analyze_only = false; ivinfo.message_level = elevel; /* We don't yet know rel_tuples, so pass -1 */ ivinfo.num_heap_tuples = -1; @@ -906,6 +907,7 @@ lazy_cleanup_index(Relation indrel, ivinfo.index = indrel; ivinfo.vacuum_full = false; + ivinfo.analyze_only = false; ivinfo.message_level = elevel; ivinfo.num_heap_tuples = vacrelstats->rel_tuples; ivinfo.strategy = vac_strategy; diff --git a/src/backend/nodes/tidbitmap.c 
b/src/backend/nodes/tidbitmap.c index e214bbb7634..e56b4696b4f 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -32,7 +32,7 @@ * Copyright (c) 2003-2009, PostgreSQL Global Development Group * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.17 2009/01/10 21:08:36 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.18 2009/03/24 20:17:14 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -310,6 +310,22 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, } /* + * tbm_add_page - add a whole page to a TIDBitmap + * + * This causes the whole page to be reported (with the recheck flag) + * when the TIDBitmap is scanned. + */ +void +tbm_add_page(TIDBitmap *tbm, BlockNumber pageno) +{ + /* Enter the page in the bitmap, or mark it lossy if already present */ + tbm_mark_page_lossy(tbm, pageno); + /* If we went over the memory limit, lossify some more pages */ + if (tbm->nentries > tbm->maxentries) + tbm_lossify(tbm); +} + +/* * tbm_union - set union * * a is modified in-place, b is not changed @@ -496,7 +512,7 @@ tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b) { /* * Some of the tuples in 'a' might not satisfy the quals for 'b', - * but because the page 'b' is lossy, we don't know which ones. + * but because the page 'b' is lossy, we don't know which ones. * Therefore we mark 'a' as requiring rechecks, to indicate that * at most those tuples set in 'a' are matches. 
*/ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index b1048504c2c..65fd7f73310 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/genam.h,v 1.75 2009/01/01 17:23:55 momjian Exp $ + * $PostgreSQL: pgsql/src/include/access/genam.h,v 1.76 2009/03/24 20:17:14 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -41,6 +41,7 @@ typedef struct IndexVacuumInfo { Relation index; /* the index being vacuumed */ bool vacuum_full; /* VACUUM FULL (we have exclusive lock) */ + bool analyze_only; /* ANALYZE (without any actual vacuum) */ int message_level; /* ereport level for progress messages */ double num_heap_tuples; /* tuples remaining in heap */ BufferAccessStrategy strategy; /* access strategy for reads */ diff --git a/src/include/access/gin.h b/src/include/access/gin.h index 1425333221d..f0f45bc5e8a 100644 --- a/src/include/access/gin.h +++ b/src/include/access/gin.h @@ -4,11 +4,9 @@ * * Copyright (c) 2006-2009, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/access/gin.h,v 1.28 2009/01/10 21:08:36 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/gin.h,v 1.29 2009/03/24 20:17:14 tgl Exp $ *-------------------------------------------------------------------------- */ - - #ifndef GIN_H #define GIN_H @@ -16,11 +14,6 @@ #include "access/itup.h" #include "access/xlog.h" #include "fmgr.h" -#include "nodes/tidbitmap.h" -#include "storage/block.h" -#include "storage/buf.h" -#include "storage/off.h" -#include "storage/relfilenode.h" /* @@ -43,20 +36,52 @@ typedef struct GinPageOpaqueData { BlockNumber rightlink; /* next page if any */ - OffsetNumber maxoff; /* number entries on GIN_DATA page: number of + OffsetNumber maxoff; /* number entries on GIN_DATA page; number of 
* heap ItemPointer on GIN_DATA|GIN_LEAF page * and number of records on GIN_DATA & - * ~GIN_LEAF page */ + * ~GIN_LEAF page. On GIN_LIST page, number of + * heap tuples. */ uint16 flags; /* see bit definitions below */ } GinPageOpaqueData; typedef GinPageOpaqueData *GinPageOpaque; -#define GIN_ROOT_BLKNO (0) - #define GIN_DATA (1 << 0) #define GIN_LEAF (1 << 1) #define GIN_DELETED (1 << 2) +#define GIN_META (1 << 3) +#define GIN_LIST (1 << 4) +#define GIN_LIST_FULLROW (1 << 5) /* makes sense only on GIN_LIST page */ + +/* Page numbers of fixed-location pages */ +#define GIN_METAPAGE_BLKNO (0) +#define GIN_ROOT_BLKNO (1) + +typedef struct GinMetaPageData +{ + /* + * Pointers to head and tail of pending list, which consists of GIN_LIST + * pages. These store fast-inserted entries that haven't yet been moved + * into the regular GIN structure. + */ + BlockNumber head; + BlockNumber tail; + + /* + * Free space in bytes in the pending list's tail page. + */ + uint32 tailFreeSize; + + /* + * We store both number of pages and number of heap tuples + * that are in the pending list. 
+ */ + BlockNumber nPendingPages; + int64 nPendingHeapTuples; +} GinMetaPageData; + +#define GinPageGetMeta(p) \ + ((GinMetaPageData *) PageGetContents(p)) /* * Works on page @@ -68,6 +93,8 @@ typedef GinPageOpaqueData *GinPageOpaque; #define GinPageSetNonLeaf(page) ( GinPageGetOpaque(page)->flags &= ~GIN_LEAF ) #define GinPageIsData(page) ( GinPageGetOpaque(page)->flags & GIN_DATA ) #define GinPageSetData(page) ( GinPageGetOpaque(page)->flags |= GIN_DATA ) +#define GinPageHasFullRow(page) ( GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW ) +#define GinPageSetFullRow(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST_FULLROW ) #define GinPageIsDeleted(page) ( GinPageGetOpaque(page)->flags & GIN_DELETED) #define GinPageSetDeleted(page) ( GinPageGetOpaque(page)->flags |= GIN_DELETED) @@ -76,8 +103,8 @@ typedef GinPageOpaqueData *GinPageOpaque; #define GinPageRightMost(page) ( GinPageGetOpaque(page)->rightlink == InvalidBlockNumber) /* - * Define our ItemPointerGet(BlockNumber|GetOffsetNumber) - * to prevent asserts + * We use our own ItemPointerGet(BlockNumber|GetOffsetNumber) + * to avoid Asserts, since sometimes the ip_posid isn't "valid" */ #define GinItemPointerGetBlockNumber(pointer) \ @@ -86,6 +113,22 @@ typedef GinPageOpaqueData *GinPageOpaque; #define GinItemPointerGetOffsetNumber(pointer) \ ((pointer)->ip_posid) +#define ItemPointerSetMin(p) \ + ItemPointerSet((p), (BlockNumber)0, (OffsetNumber)0) +#define ItemPointerIsMin(p) \ + (ItemPointerGetOffsetNumber(p) == (OffsetNumber)0 && \ + ItemPointerGetBlockNumber(p) == (BlockNumber)0) +#define ItemPointerSetMax(p) \ + ItemPointerSet((p), InvalidBlockNumber, (OffsetNumber)0xffff) +#define ItemPointerIsMax(p) \ + (ItemPointerGetOffsetNumber(p) == (OffsetNumber)0xffff && \ + ItemPointerGetBlockNumber(p) == InvalidBlockNumber) +#define ItemPointerSetLossyPage(p, b) \ + ItemPointerSet((p), (b), (OffsetNumber)0xffff) +#define ItemPointerIsLossyPage(p) \ + (ItemPointerGetOffsetNumber(p) == (OffsetNumber)0xffff && \ + 
ItemPointerGetBlockNumber(p) != InvalidBlockNumber) + typedef struct { BlockIdData child_blkno; /* use it instead of BlockNumber to save space @@ -135,6 +178,26 @@ typedef struct - GinPageGetOpaque(page)->maxoff * GinSizeOfItem(page) \ - MAXALIGN(sizeof(GinPageOpaqueData))) +/* + * List pages + */ +#define GinListPageSize \ + ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GinPageOpaqueData)) ) + +/* + * Storage type for GIN's reloptions + */ +typedef struct GinOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + bool useFastUpdate; /* use fast updates? */ +} GinOptions; + +#define GIN_DEFAULT_USE_FASTUPDATE true +#define GinGetUseFastUpdate(relation) \ + ((relation)->rd_options ? \ + ((GinOptions *) (relation)->rd_options)->useFastUpdate : GIN_DEFAULT_USE_FASTUPDATE) + #define GIN_UNLOCK BUFFER_LOCK_UNLOCK #define GIN_SHARE BUFFER_LOCK_SHARE @@ -234,14 +297,52 @@ typedef struct ginxlogDeletePage BlockNumber rightLink; } ginxlogDeletePage; +#define XLOG_GIN_UPDATE_META_PAGE 0x60 + +typedef struct ginxlogUpdateMeta +{ + RelFileNode node; + GinMetaPageData metadata; + BlockNumber prevTail; + BlockNumber newRightlink; + int32 ntuples; /* if ntuples > 0 then metadata.tail was updated + * with that many tuples; else new sub list was + * inserted */ + /* array of inserted tuples follows */ +} ginxlogUpdateMeta; + +#define XLOG_GIN_INSERT_LISTPAGE 0x70 + +typedef struct ginxlogInsertListPage +{ + RelFileNode node; + BlockNumber blkno; + BlockNumber rightlink; + int32 ntuples; + /* array of inserted tuples follows */ +} ginxlogInsertListPage; + +#define XLOG_GIN_DELETE_LISTPAGE 0x80 + +#define GIN_NDELETE_AT_ONCE 16 +typedef struct ginxlogDeleteListPages +{ + RelFileNode node; + GinMetaPageData metadata; + int32 ndeleted; + BlockNumber toDelete[GIN_NDELETE_AT_ONCE]; +} ginxlogDeleteListPages; + + /* ginutil.c */ extern Datum ginoptions(PG_FUNCTION_ARGS); extern void initGinState(GinState *state, Relation index); extern Buffer GinNewBuffer(Relation 
index); extern void GinInitBuffer(Buffer b, uint32 f); extern void GinInitPage(Page page, uint32 f, Size pageSize); +extern void GinInitMetabuffer(Buffer b); extern int compareEntries(GinState *ginstate, OffsetNumber attnum, Datum a, Datum b); -extern int compareAttEntries(GinState *ginstate, OffsetNumber attnum_a, Datum a, +extern int compareAttEntries(GinState *ginstate, OffsetNumber attnum_a, Datum a, OffsetNumber attnum_b, Datum b); extern Datum *extractEntriesS(GinState *ginstate, OffsetNumber attnum, Datum value, int32 *nentries, bool *needUnique); @@ -249,9 +350,14 @@ extern Datum *extractEntriesSU(GinState *ginstate, OffsetNumber attnum, Datum va extern Datum gin_index_getattr(GinState *ginstate, IndexTuple tuple); extern OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple); + /* gininsert.c */ extern Datum ginbuild(PG_FUNCTION_ARGS); extern Datum gininsert(PG_FUNCTION_ARGS); +extern void ginEntryInsert(Relation index, GinState *ginstate, + OffsetNumber attnum, Datum value, + ItemPointerData *items, uint32 nitem, + bool isBuild); /* ginxlog.c */ extern void gin_redo(XLogRecPtr lsn, XLogRecord *record); @@ -319,7 +425,7 @@ extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack); extern void findParents(GinBtree btree, GinBtreeStack *stack, BlockNumber rootBlkno); /* ginentrypage.c */ -extern IndexTuple GinFormTuple(GinState *ginstate, OffsetNumber attnum, Datum key, +extern IndexTuple GinFormTuple(GinState *ginstate, OffsetNumber attnum, Datum key, ItemPointerData *ipd, uint32 nipd); extern void prepareEntryScan(GinBtree btree, Relation index, OffsetNumber attnum, Datum value, GinState *ginstate); @@ -440,13 +546,7 @@ extern void newScanKey(IndexScanDesc scan); /* ginget.c */ extern PGDLLIMPORT int GinFuzzySearchLimit; -#define ItemPointerSetMax(p) ItemPointerSet( (p), (BlockNumber)0xffffffff, (OffsetNumber)0xffff ) -#define ItemPointerIsMax(p) ( ItemPointerGetBlockNumber(p) == (BlockNumber)0xffffffff && 
ItemPointerGetOffsetNumber(p) == (OffsetNumber)0xffff ) -#define ItemPointerSetMin(p) ItemPointerSet( (p), (BlockNumber)0, (OffsetNumber)0) -#define ItemPointerIsMin(p) ( ItemPointerGetBlockNumber(p) == (BlockNumber)0 && ItemPointerGetOffsetNumber(p) == (OffsetNumber)0 ) - extern Datum gingetbitmap(PG_FUNCTION_ARGS); -extern Datum gingettuple(PG_FUNCTION_ARGS); /* ginvacuum.c */ extern Datum ginbulkdelete(PG_FUNCTION_ARGS); @@ -485,8 +585,26 @@ typedef struct extern void ginInitBA(BuildAccumulator *accum); extern void ginInsertRecordBA(BuildAccumulator *accum, - ItemPointer heapptr, + ItemPointer heapptr, OffsetNumber attnum, Datum *entries, int32 nentry); extern ItemPointerData *ginGetEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *entry, uint32 *n); -#endif +/* ginfast.c */ + +typedef struct GinTupleCollector +{ + IndexTuple *tuples; + uint32 ntuples; + uint32 lentuples; + uint32 sumsize; +} GinTupleCollector; + +extern void ginHeapTupleFastInsert(Relation index, GinState *ginstate, + GinTupleCollector *collector); +extern uint32 ginHeapTupleFastCollect(Relation index, GinState *ginstate, + GinTupleCollector *collector, + OffsetNumber attnum, Datum value, ItemPointer item); +extern void ginInsertCleanup(Relation index, GinState *ginstate, + bool vac_delay, IndexBulkDeleteResult *stats); + +#endif /* GIN_H */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 784ba688919..3d4fdc33bd3 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -37,7 +37,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.524 2009/02/24 10:06:34 petere Exp $ + * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.525 2009/03/24 20:17:15 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -53,6 +53,6 @@ */ /* 
yyyymmddN */ -#define CATALOG_VERSION_NO 200902242 +#define CATALOG_VERSION_NO 200903241 #endif diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h index 7736cb6e58a..a92c1f49971 100644 --- a/src/include/catalog/pg_am.h +++ b/src/include/catalog/pg_am.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_am.h,v 1.61 2009/03/05 23:06:45 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_am.h,v 1.62 2009/03/24 20:17:15 tgl Exp $ * * NOTES * the genbki.sh script reads this file and generates .bki @@ -118,7 +118,7 @@ DESCR("hash index access method"); DATA(insert OID = 783 ( gist 0 7 f f f t t t t t t 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); DESCR("GiST index access method"); #define GIST_AM_OID 783 -DATA(insert OID = 2742 ( gin 0 5 f f f t t f f t f 0 gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); +DATA(insert OID = 2742 ( gin 0 5 f f f t t f f t f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); DESCR("GIN index access method"); #define GIN_AM_OID 2742 diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index b0c5be4323f..2f0dbeb2656 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.537 2009/02/24 10:06:34 petere Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 
1.538 2009/03/24 20:17:15 tgl Exp $ * * NOTES * The script catalog/genbki.sh reads this file and generates .bki @@ -4184,8 +4184,6 @@ DATA(insert OID = 2592 ( gist_circle_compress PGNSP PGUID 12 1 0 0 f f f t f i DESCR("GiST support"); /* GIN */ -DATA(insert OID = 2730 ( gingettuple PGNSP PGUID 12 1 0 0 f f f t f v 2 0 16 "2281 2281" _null_ _null_ _null_ _null_ gingettuple _null_ _null_ _null_ )); -DESCR("gin(internal)"); DATA(insert OID = 2731 ( gingetbitmap PGNSP PGUID 12 1 0 0 f f f t f v 2 0 20 "2281 2281" _null_ _null_ _null_ _null_ gingetbitmap _null_ _null_ _null_ )); DESCR("gin(internal)"); DATA(insert OID = 2732 ( gininsert PGNSP PGUID 12 1 0 0 f f f t f v 6 0 16 "2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ gininsert _null_ _null_ _null_ )); diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index 93658543e42..97e1d4c9c40 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -15,7 +15,7 @@ * * Copyright (c) 2003-2009, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/nodes/tidbitmap.h,v 1.9 2009/01/10 21:08:36 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/tidbitmap.h,v 1.10 2009/03/24 20:17:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -52,6 +52,7 @@ extern void tbm_free(TIDBitmap *tbm); extern void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck); +extern void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno); extern void tbm_union(TIDBitmap *a, const TIDBitmap *b); extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b); |