/*------------------------------------------------------------------------- * * ginget.c * fetch tuples from a GIN scan. * * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.27.2.3 2010/08/01 19:16:55 tgl Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/gin.h" #include "access/relscan.h" #include "catalog/index.h" #include "miscadmin.h" #include "storage/bufmgr.h" #include "utils/datum.h" #include "utils/memutils.h" typedef struct pendingPosition { Buffer pendingBuffer; OffsetNumber firstOffset; OffsetNumber lastOffset; ItemPointerData item; bool *hasMatchKey; } pendingPosition; /* * Tries to refind previously taken ItemPointer on page. */ static bool findItemInPage(Page page, ItemPointer item, OffsetNumber *off) { OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; int res; if (GinPageGetOpaque(page)->flags & GIN_DELETED) /* page was deleted by concurrent vacuum */ return false; /* * scan page to find equal or first greater value */ for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++) { res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off)); if (res <= 0) return true; } return false; } /* * Goes to the next page if current offset is outside of bounds */ static bool moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack) { Page page = BufferGetPage(stack->buffer); if (stack->off > PageGetMaxOffsetNumber(page)) { /* * We scanned the whole page, so we should take right page */ stack->blkno = GinPageGetOpaque(page)->rightlink; if (GinPageRightMost(page)) return false; /* no more pages */ LockBuffer(stack->buffer, GIN_UNLOCK); stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno); LockBuffer(stack->buffer, GIN_SHARE); stack->off = FirstOffsetNumber; } return true; } /* * Does fullscan of posting tree and saves ItemPointers * in scanEntry->partialMatch TIDBitmap */ static void scanForItems(Relation index, GinScanEntry scanEntry, BlockNumber rootPostingTree) { GinPostingTreeScan *gdi; Buffer buffer; Page page; BlockNumber blkno; gdi = prepareScanPostingTree(index, rootPostingTree, TRUE); buffer = scanBeginPostingTree(gdi); IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */ freeGinBtreeStack(gdi->stack); pfree(gdi); /* * Goes through all leaves */ for (;;) { page = BufferGetPage(buffer); if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0 && GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber) { tbm_add_tuples(scanEntry->partialMatch, (ItemPointer) GinDataPageGetItem(page, FirstOffsetNumber), GinPageGetOpaque(page)->maxoff, false); scanEntry->predictNumberResult += GinPageGetOpaque(page)->maxoff; } blkno = GinPageGetOpaque(page)->rightlink; if (GinPageRightMost(page)) { UnlockReleaseBuffer(buffer); return; /* no more pages */ } LockBuffer(buffer, GIN_UNLOCK); buffer = ReleaseAndReadBuffer(buffer, index, blkno); LockBuffer(buffer, GIN_SHARE); } } /* * Collects all ItemPointer into the TIDBitmap struct * for entries partially matched to search entry. 
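 * (For example, for a prefix-search strategy the entry-tree scan is first
 * positioned at the smallest key that is >= the search prefix, and postings
 * are then collected for every consecutive key that comparePartialFn still
 * reports as matching.)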
 *
 * Returns true if done, false if the scan must be restarted from scratch.
 */
static bool
computePartialMatchList(GinBtreeData *btree, GinBtreeStack *stack, GinScanEntry scanEntry)
{
	Page		page;
	IndexTuple	itup;
	Datum		idatum;
	int32		cmp;

	scanEntry->partialMatch = tbm_create(work_mem * 1024L);

	for (;;)
	{
		/*
		 * stack->off points to the entry of interest; the buffer is already
		 * locked.
		 */
		if (moveRightIfItNeeded(btree, stack) == false)
			return true;

		page = BufferGetPage(stack->buffer);
		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));

		/*
		 * If the tuple belongs to another attribute, stop the scan.
		 */
		if (gintuple_get_attrnum(btree->ginstate, itup) != scanEntry->attnum)
			return true;

		idatum = gin_index_getattr(btree->ginstate, itup);

		/*----------
		 * Check for a partial match.
		 * case cmp == 0 => match
		 * case cmp > 0  => no match, finish scan
		 * case cmp < 0  => no match, continue scan
		 *----------
		 */
		cmp = DatumGetInt32(FunctionCall4(&btree->ginstate->comparePartialFn[scanEntry->attnum - 1],
										  scanEntry->entry,
										  idatum,
										  UInt16GetDatum(scanEntry->strategy),
										  PointerGetDatum(scanEntry->extra_data)));

		if (cmp > 0)
			return true;
		else if (cmp < 0)
		{
			stack->off++;
			continue;
		}

		if (GinIsPostingTree(itup))
		{
			BlockNumber rootPostingTree = GinGetPostingTree(itup);
			Datum		newDatum,
						savedDatum = datumCopy(idatum,
											   btree->ginstate->origTupdesc->attrs[scanEntry->attnum - 1]->attbyval,
											   btree->ginstate->origTupdesc->attrs[scanEntry->attnum - 1]->attlen);

			/*
			 * We must unlock the current page (but keep it pinned) while
			 * scanning the posting tree, to prevent deadlock against vacuum
			 * processes.
			 *
			 * We save the current entry value (savedDatum) so that we can
			 * refind our tuple after re-locking the page.
			 */
			LockBuffer(stack->buffer, GIN_UNLOCK);
			scanForItems(btree->index, scanEntry, rootPostingTree);

			/*
			 * Re-lock the entry page.  Insertions might have occurred while
			 * it was unlocked, so we need to refind our position.
			 */
			LockBuffer(stack->buffer, GIN_SHARE);
			page = BufferGetPage(stack->buffer);
			if (!GinPageIsLeaf(page))
			{
				/*
				 * The root page became non-leaf while it was unlocked.  We
				 * will start over; this happens rarely, since the root can
				 * turn into a non-leaf page only once in the life of the
				 * index.
				 */
				return false;
			}

			for (;;)
			{
				if (moveRightIfItNeeded(btree, stack) == false)
					elog(ERROR, "lost saved point in index");	/* must not happen !!! */

				page = BufferGetPage(stack->buffer);
				itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
				newDatum = gin_index_getattr(btree->ginstate, itup);

				if (gintuple_get_attrnum(btree->ginstate, itup) != scanEntry->attnum)
					elog(ERROR, "lost saved point in index");	/* must not happen !!! */

				if (compareEntries(btree->ginstate, scanEntry->attnum,
								   newDatum, savedDatum) == 0)
				{
					/* Found! */
					if (btree->ginstate->origTupdesc->attrs[scanEntry->attnum - 1]->attbyval == false)
						pfree(DatumGetPointer(savedDatum));
					break;
				}

				stack->off++;
			}
		}
		else
		{
			tbm_add_tuples(scanEntry->partialMatch,
						   GinGetPosting(itup), GinGetNPosting(itup),
						   false);
			scanEntry->predictNumberResult += GinGetNPosting(itup);
		}

		/*
		 * The ItemPointers are collected; go to the next entry.
		 */
		stack->off++;
	}

	return true;
}

/*
 * The start* functions set up the beginning state of a scan: they find the
 * correct buffer and pin it.
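 *
 * startScanEntry initializes one GinScanEntry: it locates the entry in the
 * entry tree and either copies the posting list into memory, begins a
 * posting-tree scan, or (for partial-match entries) collects a TIDBitmap of
 * matches.  startScanKey does this for every entry of a scan key and then
 * applies the GinFuzzySearchLimit reduction, if enabled.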
*/ static void startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) { GinBtreeData btreeEntry; GinBtreeStack *stackEntry; Page page; bool needUnlock = TRUE; entry->buffer = InvalidBuffer; entry->offset = InvalidOffsetNumber; entry->list = NULL; entry->nlist = 0; entry->partialMatch = NULL; entry->partialMatchResult = NULL; entry->reduceResult = FALSE; entry->predictNumberResult = 0; if (entry->master != NULL) { entry->isFinished = entry->master->isFinished; return; } /* * we should find entry, and begin scan of posting tree or just store * posting list in memory */ prepareEntryScan(&btreeEntry, index, entry->attnum, entry->entry, ginstate); btreeEntry.searchMode = TRUE; stackEntry = ginFindLeafPage(&btreeEntry, NULL); page = BufferGetPage(stackEntry->buffer); entry->isFinished = TRUE; if (entry->isPartialMatch) { /* * btreeEntry.findItem points to the first equal or greater value than * needed. So we will scan further and collect all ItemPointers */ btreeEntry.findItem(&btreeEntry, stackEntry); if (computePartialMatchList(&btreeEntry, stackEntry, entry) == false) { /* * GIN tree was seriously restructured, so we will cleanup all * found data and rescan. See comments near 'return false' in * computePartialMatchList() */ if (entry->partialMatch) { if (entry->partialMatchIterator) tbm_end_iterate(entry->partialMatchIterator); entry->partialMatchIterator = NULL; tbm_free(entry->partialMatch); entry->partialMatch = NULL; } LockBuffer(stackEntry->buffer, GIN_UNLOCK); freeGinBtreeStack(stackEntry); startScanEntry(index, ginstate, entry); return; } if (entry->partialMatch && !tbm_is_empty(entry->partialMatch)) { entry->partialMatchIterator = tbm_begin_iterate(entry->partialMatch); entry->isFinished = FALSE; } } else if (btreeEntry.findItem(&btreeEntry, stackEntry)) { IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off)); if (GinIsPostingTree(itup)) { BlockNumber rootPostingTree = GinGetPostingTree(itup); GinPostingTreeScan *gdi; Page page; /* * We should unlock entry page before make deal with posting tree * to prevent deadlocks with vacuum processes. Because entry is * never deleted from page and posting tree is never reduced to * the posting list, we can unlock page after getting BlockNumber * of root of posting tree. */ LockBuffer(stackEntry->buffer, GIN_UNLOCK); needUnlock = FALSE; gdi = prepareScanPostingTree(index, rootPostingTree, TRUE); entry->buffer = scanBeginPostingTree(gdi); /* * We keep buffer pinned because we need to prevent deletion of * page during scan. See GIN's vacuum implementation. RefCount is * increased to keep buffer pinned after freeGinBtreeStack() call. 
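			 * (freeGinBtreeStack() releases the buffers referenced by the
			 * stack, so without the extra reference we would lose our only
			 * pin on the leaf page.)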
			 */
			IncrBufferRefCount(entry->buffer);

			page = BufferGetPage(entry->buffer);
			entry->predictNumberResult = gdi->stack->predictNumber * GinPageGetOpaque(page)->maxoff;

			/*
			 * Copy the page content into local memory, so that we don't have
			 * to hold the page locked for a long time.
			 */
			entry->list = (ItemPointerData *) palloc(BLCKSZ);
			entry->nlist = GinPageGetOpaque(page)->maxoff;
			memcpy(entry->list, GinDataPageGetItem(page, FirstOffsetNumber),
				   GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData));

			LockBuffer(entry->buffer, GIN_UNLOCK);
			freeGinBtreeStack(gdi->stack);
			pfree(gdi);
			entry->isFinished = FALSE;
		}
		else if (GinGetNPosting(itup) > 0)
		{
			entry->nlist = GinGetNPosting(itup);
			entry->list = (ItemPointerData *) palloc(sizeof(ItemPointerData) * entry->nlist);
			memcpy(entry->list, GinGetPosting(itup),
				   sizeof(ItemPointerData) * entry->nlist);
			entry->isFinished = FALSE;
		}
	}

	if (needUnlock)
		LockBuffer(stackEntry->buffer, GIN_UNLOCK);
	freeGinBtreeStack(stackEntry);
}

static void
startScanKey(Relation index, GinState *ginstate, GinScanKey key)
{
	uint32		i;

	if (!key->firstCall)
		return;

	for (i = 0; i < key->nentries; i++)
		startScanEntry(index, ginstate, key->scanEntry + i);

	key->isFinished = FALSE;
	key->firstCall = FALSE;

	if (GinFuzzySearchLimit > 0)
	{
		/*
		 * If every entry's predicted number of results exceeds the
		 * threshold, try to reduce the result set.  We hope (it is only a
		 * hope; for the intersection operation on arrays the assumption does
		 * not hold) that the total result will be no larger than the
		 * smallest predictNumberResult.
		 */
		for (i = 0; i < key->nentries; i++)
			if (key->scanEntry[i].predictNumberResult <= key->nentries * GinFuzzySearchLimit)
				return;

		for (i = 0; i < key->nentries; i++)
			if (key->scanEntry[i].predictNumberResult > key->nentries * GinFuzzySearchLimit)
			{
				key->scanEntry[i].predictNumberResult /= key->nentries;
				key->scanEntry[i].reduceResult = TRUE;
			}
	}
}

static void
startScan(IndexScanDesc scan)
{
	uint32		i;
	GinScanOpaque so = (GinScanOpaque) scan->opaque;

	for (i = 0; i < so->nkeys; i++)
		startScanKey(scan->indexRelation, &so->ginstate, so->keys + i);
}

/*
 * Gets the next ItemPointer from a posting tree.  Note that we copy the page
 * into the GinScanEntry->list array and unlock the page, but keep it pinned
 * to prevent interference from vacuum.
 */
static void
entryGetNextItem(Relation index, GinScanEntry entry)
{
	Page		page;
	BlockNumber blkno;

	for (;;)
	{
		if (entry->offset < entry->nlist)
		{
			entry->curItem = entry->list[entry->offset++];
			return;
		}

		LockBuffer(entry->buffer, GIN_SHARE);
		page = BufferGetPage(entry->buffer);
		for (;;)
		{
			/*
			 * Go to the next page via the right link.
During that we should refind * first ItemPointer greater that stored */ blkno = GinPageGetOpaque(page)->rightlink; LockBuffer(entry->buffer, GIN_UNLOCK); if (blkno == InvalidBlockNumber) { ReleaseBuffer(entry->buffer); ItemPointerSetInvalid(&entry->curItem); entry->buffer = InvalidBuffer; entry->isFinished = TRUE; return; } entry->buffer = ReleaseAndReadBuffer(entry->buffer, index, blkno); LockBuffer(entry->buffer, GIN_SHARE); page = BufferGetPage(entry->buffer); entry->offset = InvalidOffsetNumber; if (!ItemPointerIsValid(&entry->curItem) || findItemInPage(page, &entry->curItem, &entry->offset)) { /* * Found position equal to or greater than stored */ entry->nlist = GinPageGetOpaque(page)->maxoff; memcpy(entry->list, GinDataPageGetItem(page, FirstOffsetNumber), GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData)); LockBuffer(entry->buffer, GIN_UNLOCK); if (!ItemPointerIsValid(&entry->curItem) || compareItemPointers(&entry->curItem, entry->list + entry->offset - 1) == 0) { /* * First pages are deleted or empty, or we found exact * position, so break inner loop and continue outer one. */ break; } /* * Find greater than entry->curItem position, store it. */ entry->curItem = entry->list[entry->offset - 1]; return; } } } } /* convenience function for invoking a key's consistentFn */ static inline bool callConsistentFn(GinState *ginstate, GinScanKey key) { /* * Initialize recheckCurItem in case the consistentFn doesn't know it * should set it. The safe assumption in that case is to force recheck. */ key->recheckCurItem = true; return DatumGetBool(FunctionCall6(&ginstate->consistentFn[key->attnum - 1], PointerGetDatum(key->entryRes), UInt16GetDatum(key->strategy), key->query, UInt32GetDatum(key->nentries), PointerGetDatum(key->extra_data), PointerGetDatum(&key->recheckCurItem))); } #define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE)) #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) /* * Sets entry->curItem to next heap item pointer for one entry of one scan key, * or sets entry->isFinished to TRUE if there are no more. * * Item pointers must be returned in ascending order. * * Note: this can return a "lossy page" item pointer, indicating that the * entry potentially matches all items on that heap page. However, it is * not allowed to return both a lossy page pointer and exact (regular) * item pointers for the same page. (Doing so would break the key-combination * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the * current implementation this is guaranteed by the behavior of tidbitmaps. */ static void entryGetItem(Relation index, GinScanEntry entry) { Assert(!entry->isFinished); if (entry->master) { entry->isFinished = entry->master->isFinished; entry->curItem = entry->master->curItem; } else if (entry->partialMatch) { do { if (entry->partialMatchResult == NULL || entry->offset >= entry->partialMatchResult->ntuples) { entry->partialMatchResult = tbm_iterate(entry->partialMatchIterator); if (entry->partialMatchResult == NULL) { ItemPointerSetInvalid(&entry->curItem); tbm_end_iterate(entry->partialMatchIterator); entry->partialMatchIterator = NULL; entry->isFinished = TRUE; break; } /* * reset counter to the beginning of * entry->partialMatchResult. Note: entry->offset is still * greater than partialMatchResult->ntuples if * partialMatchResult is lossy. So, on next call we will get * next result from TIDBitmap. 
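				 * (A lossy TBMIterateResult has ntuples == -1, so the test
				 * "entry->offset >= entry->partialMatchResult->ntuples" above
				 * remains true and the next call immediately fetches a new
				 * result from the TIDBitmap.)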
*/ entry->offset = 0; } if (entry->partialMatchResult->ntuples < 0) { /* * lossy result, so we need to check the whole page */ ItemPointerSetLossyPage(&entry->curItem, entry->partialMatchResult->blockno); /* * We might as well fall out of the loop; we could not * estimate number of results on this page to support correct * reducing of result even if it's enabled */ break; } ItemPointerSet(&entry->curItem, entry->partialMatchResult->blockno, entry->partialMatchResult->offsets[entry->offset]); entry->offset++; } while (entry->reduceResult == TRUE && dropItem(entry)); } else if (!BufferIsValid(entry->buffer)) { entry->offset++; if (entry->offset <= entry->nlist) entry->curItem = entry->list[entry->offset - 1]; else { ItemPointerSetInvalid(&entry->curItem); entry->isFinished = TRUE; } } else { do { entryGetNextItem(index, entry); } while (entry->isFinished == FALSE && entry->reduceResult == TRUE && dropItem(entry)); } } /* * Sets key->curItem to next heap item pointer for one scan key, advancing * past any item pointers <= advancePast. * Sets key->isFinished to TRUE if there are no more. * * On success, key->recheckCurItem is set true iff recheck is needed for this * item pointer (including the case where the item pointer is a lossy page * pointer). * * Item pointers must be returned in ascending order. * * Note: this can return a "lossy page" item pointer, indicating that the * key potentially matches all items on that heap page. However, it is * not allowed to return both a lossy page pointer and exact (regular) * item pointers for the same page. (Doing so would break the key-combination * logic in scanGetItem.) */ static void keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx, GinScanKey key, ItemPointer advancePast) { ItemPointerData myAdvancePast = *advancePast; ItemPointerData curPageLossy; uint32 i; uint32 lossyEntry; bool haveLossyEntry; GinScanEntry entry; bool res; MemoryContext oldCtx; Assert(!key->isFinished); do { /* * Advance any entries that are <= myAdvancePast. In particular, * since entry->curItem was initialized with ItemPointerSetMin, this * ensures we fetch the first item for each entry on the first call. * Then set key->curItem to the minimum of the valid entry curItems. * * Note: a lossy-page entry is encoded by a ItemPointer with max value * for offset (0xffff), so that it will sort after any exact entries * for the same page. So we'll prefer to return exact pointers not * lossy pointers, which is good. Also, when we advance past an exact * entry after processing it, we will not advance past lossy entries * for the same page in other keys, which is NECESSARY for correct * results (since we might have additional entries for the same page * in the first key). */ ItemPointerSetMax(&key->curItem); for (i = 0; i < key->nentries; i++) { entry = key->scanEntry + i; while (entry->isFinished == FALSE && compareItemPointers(&entry->curItem, &myAdvancePast) <= 0) entryGetItem(index, entry); if (entry->isFinished == FALSE && compareItemPointers(&entry->curItem, &key->curItem) < 0) key->curItem = entry->curItem; } if (ItemPointerIsMax(&key->curItem)) { /* all entries are finished */ key->isFinished = TRUE; return; } /* * Now key->curItem contains first ItemPointer after previous result. * Advance myAdvancePast to this value, so that if the consistentFn * rejects the entry and we loop around again, we will advance to the * next available item pointer. 
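		 * (Each pass through the loop therefore either returns an item or
		 * strictly advances myAdvancePast, which guarantees forward progress
		 * and keeps the returned stream in ascending TID order.)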
*/ myAdvancePast = key->curItem; /* * Lossy-page entries pose a problem, since we don't know the correct * entryRes state to pass to the consistentFn, and we also don't know * what its combining logic will be (could be AND, OR, or even NOT). * If the logic is OR then the consistentFn might succeed for all * items in the lossy page even when none of the other entries match. * * If we have a single lossy-page entry then we check to see if the * consistentFn will succeed with only that entry TRUE. If so, * we return a lossy-page pointer to indicate that the whole heap * page must be checked. (On the next call, we'll advance past all * regular and lossy entries for this page before resuming search, * thus ensuring that we never return both regular and lossy pointers * for the same page.) * * This idea could be generalized to more than one lossy-page entry, * but ideally lossy-page entries should be infrequent so it would * seldom be the case that we have more than one at once. So it * doesn't seem worth the extra complexity to optimize that case. * If we do find more than one, we just punt and return a lossy-page * pointer always. * * Note that only lossy-page entries pointing to the current item's * page should trigger this processing; we might have future lossy * pages in the entry array, but they aren't relevant yet. */ ItemPointerSetLossyPage(&curPageLossy, GinItemPointerGetBlockNumber(&key->curItem)); lossyEntry = 0; haveLossyEntry = false; for (i = 0; i < key->nentries; i++) { entry = key->scanEntry + i; if (entry->isFinished == FALSE && compareItemPointers(&entry->curItem, &curPageLossy) == 0) { if (haveLossyEntry) { /* Multiple lossy entries, punt */ key->curItem = curPageLossy; key->recheckCurItem = true; return; } lossyEntry = i; haveLossyEntry = true; } } /* prepare for calling consistentFn in temp context */ oldCtx = MemoryContextSwitchTo(tempCtx); if (haveLossyEntry) { /* Single lossy-page entry, so see if whole page matches */ memset(key->entryRes, FALSE, key->nentries); key->entryRes[lossyEntry] = TRUE; if (callConsistentFn(ginstate, key)) { /* Yes, so clean up ... */ MemoryContextSwitchTo(oldCtx); MemoryContextReset(tempCtx); /* and return lossy pointer for whole page */ key->curItem = curPageLossy; key->recheckCurItem = true; return; } } /* * At this point we know that we don't need to return a lossy * whole-page pointer, but we might have matches for individual exact * item pointers, possibly in combination with a lossy pointer. Our * strategy if there's a lossy pointer is to try the consistentFn both * ways and return a hit if it accepts either one (forcing the hit to * be marked lossy so it will be rechecked). * * Prepare entryRes array to be passed to consistentFn. * * (If key->nentries == 1 then the consistentFn should always succeed, * but we must call it anyway to find out the recheck status.) 
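		 *
		 * For example, with one exact entry E1 matching the current item and
		 * one lossy entry E2 covering its page, we first call the
		 * consistentFn with both E1 and E2 set to TRUE; if that fails, we
		 * retry with E2 = FALSE.  A hit either way is returned with
		 * recheckCurItem forced to true.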
*/ for (i = 0; i < key->nentries; i++) { entry = key->scanEntry + i; if (entry->isFinished == FALSE && compareItemPointers(&entry->curItem, &key->curItem) == 0) key->entryRes[i] = TRUE; else key->entryRes[i] = FALSE; } if (haveLossyEntry) key->entryRes[lossyEntry] = TRUE; res = callConsistentFn(ginstate, key); if (!res && haveLossyEntry) { /* try the other way for the lossy item */ key->entryRes[lossyEntry] = FALSE; res = callConsistentFn(ginstate, key); } /* clean up after consistentFn calls */ MemoryContextSwitchTo(oldCtx); MemoryContextReset(tempCtx); /* If we matched a lossy entry, force recheckCurItem = true */ if (haveLossyEntry) key->recheckCurItem = true; } while (!res); } /* * Get ItemPointer of next heap row to be checked from pending list. * Returns false if there are no more. On pages with several rows * it returns each row separately, on page with part of heap row returns * per page data. pos->firstOffset and pos->lastOffset points * fraction of tuples for current heap row. * * The pendingBuffer is presumed pinned and share-locked on entry, and is * pinned and share-locked on success exit. On failure exit it's released. */ static bool scanGetCandidate(IndexScanDesc scan, pendingPosition *pos) { OffsetNumber maxoff; Page page; IndexTuple itup; ItemPointerSetInvalid(&pos->item); for (;;) { page = BufferGetPage(pos->pendingBuffer); maxoff = PageGetMaxOffsetNumber(page); if (pos->firstOffset > maxoff) { BlockNumber blkno = GinPageGetOpaque(page)->rightlink; if (blkno == InvalidBlockNumber) { UnlockReleaseBuffer(pos->pendingBuffer); pos->pendingBuffer = InvalidBuffer; return false; } else { /* * Here we must prevent deletion of next page by insertcleanup * process, which may be trying to obtain exclusive lock on * current page. So, we lock next page before releasing the * current one */ Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno); LockBuffer(tmpbuf, GIN_SHARE); UnlockReleaseBuffer(pos->pendingBuffer); pos->pendingBuffer = tmpbuf; pos->firstOffset = FirstOffsetNumber; } } else { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset)); pos->item = itup->t_tid; if (GinPageHasFullRow(page)) { /* * find itempointer to the next row */ for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++) { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset)); if (!ItemPointerEquals(&pos->item, &itup->t_tid)) break; } } else { /* * All itempointers are the same on this page */ pos->lastOffset = maxoff + 1; } /* * Now pos->firstOffset points to the first tuple of current heap * row, pos->lastOffset points to the first tuple of second heap * row (or to the end of page) */ break; } } return true; } /* * Scan page from current tuple (off) up till the first of: * - match is found (then returns true) * - no later match is possible * - tuple's attribute number is not equal to entry's attrnum * - reach end of page */ static bool matchPartialInPendingList(GinState *ginstate, Page page, OffsetNumber off, OffsetNumber maxoff, Datum value, OffsetNumber attrnum, Datum *datum, bool *datumExtracted, StrategyNumber strategy, Pointer extra_data) { IndexTuple itup; int32 cmp; while (off < maxoff) { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); if (attrnum != gintuple_get_attrnum(ginstate, itup)) return false; if (datumExtracted[off - 1] == false) { datum[off - 1] = gin_index_getattr(ginstate, itup); datumExtracted[off - 1] = true; } /*---------- * Check partial match. 
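		 * (This is the same comparePartialFn convention used in
		 * computePartialMatchList above.)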
		 * case cmp == 0 => match
		 * case cmp > 0  => no match, and no later match is possible, so end scan
		 * case cmp < 0  => no match, continue scan
		 *----------
		 */
		cmp = DatumGetInt32(FunctionCall4(&ginstate->comparePartialFn[attrnum - 1],
										  value,
										  datum[off - 1],
										  UInt16GetDatum(strategy),
										  PointerGetDatum(extra_data)));
		if (cmp == 0)
			return true;
		else if (cmp > 0)
			return false;

		off++;
	}

	return false;
}

static bool
hasAllMatchingKeys(GinScanOpaque so, pendingPosition *pos)
{
	int			i;

	for (i = 0; i < so->nkeys; i++)
		if (pos->hasMatchKey[i] == false)
			return false;

	return true;
}

/*
 * Sets the entryRes array for each key by examining every entry of the
 * indexed value (heap row) in the pending list.  Returns true if at least
 * one datum was matched by the key's entries.
 *
 * The pendingBuffer is presumed pinned and share-locked on entry.
 */
static bool
collectDatumForItem(IndexScanDesc scan, pendingPosition *pos)
{
	GinScanOpaque so = (GinScanOpaque) scan->opaque;
	OffsetNumber attrnum;
	Page		page;
	IndexTuple	itup;
	int			i,
				j;

	/*
	 * Reset entryRes
	 */
	for (i = 0; i < so->nkeys; i++)
	{
		GinScanKey	key = so->keys + i;

		memset(key->entryRes, FALSE, key->nentries);
	}
	memset(pos->hasMatchKey, FALSE, so->nkeys);

	for (;;)
	{
		Datum		datum[BLCKSZ / sizeof(IndexTupleData)];
		bool		datumExtracted[BLCKSZ / sizeof(IndexTupleData)];

		Assert(pos->lastOffset > pos->firstOffset);
		memset(datumExtracted + pos->firstOffset - 1, 0,
			   sizeof(bool) * (pos->lastOffset - pos->firstOffset));

		page = BufferGetPage(pos->pendingBuffer);

		for (i = 0; i < so->nkeys; i++)
		{
			GinScanKey	key = so->keys + i;

			for (j = 0; j < key->nentries; j++)
			{
				OffsetNumber StopLow = pos->firstOffset,
							StopHigh = pos->lastOffset,
							StopMiddle;
				GinScanEntry entry = key->scanEntry + j;

				/* already true, so no extra work is needed */
				if (key->entryRes[j])
					continue;

				/*
				 * The tuples of interest run from pos->firstOffset to
				 * pos->lastOffset and are ordered by (attnum, Datum), just
				 * as in the entry tree, so we can use a binary search
				 * instead of a linear scan.
				 */
				while (StopLow < StopHigh)
				{
					StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);

					itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));
					attrnum = gintuple_get_attrnum(&so->ginstate, itup);

					if (key->attnum < attrnum)
						StopHigh = StopMiddle;
					else if (key->attnum > attrnum)
						StopLow = StopMiddle + 1;
					else
					{
						int			res;

						if (datumExtracted[StopMiddle - 1] == false)
						{
							datum[StopMiddle - 1] = gin_index_getattr(&so->ginstate, itup);
							datumExtracted[StopMiddle - 1] = true;
						}
						res = compareEntries(&so->ginstate,
											 entry->attnum,
											 entry->entry,
											 datum[StopMiddle - 1]);

						if (res == 0)
						{
							/*
							 * An exact match was found, so for a
							 * partial-match entry we just scan forward from
							 * the current position to find the partial
							 * matches.  See the comment above about tuple
							 * ordering.
							 */
							if (entry->isPartialMatch)
								key->entryRes[j] =
									matchPartialInPendingList(&so->ginstate,
															  page, StopMiddle,
															  pos->lastOffset,
															  entry->entry,
															  entry->attnum,
															  datum,
															  datumExtracted,
															  entry->strategy,
															  entry->extra_data);
							else
								key->entryRes[j] = true;
							break;
						}
						else if (res < 0)
							StopHigh = StopMiddle;
						else
							StopLow = StopMiddle + 1;
					}
				}

				if (StopLow >= StopHigh && entry->isPartialMatch)
				{
					/*
					 * No exact match was found, so start the scan from the
					 * first tuple that is greater than the current entry.
					 * See the comment above about tuple ordering.
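					 * (At this point StopHigh is the first offset whose
					 * tuple compares greater than the search entry, which is
					 * exactly where a partial match could begin.)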
*/ key->entryRes[j] = matchPartialInPendingList(&so->ginstate, page, StopHigh, pos->lastOffset, entry->entry, entry->attnum, datum, datumExtracted, entry->strategy, entry->extra_data); } pos->hasMatchKey[i] |= key->entryRes[j]; } } pos->firstOffset = pos->lastOffset; if (GinPageHasFullRow(page)) { /* * We scan all values from one tuple, go to next one */ return hasAllMatchingKeys(so, pos); } else { ItemPointerData item = pos->item; /* * need to get next portion of tuples of row containing on several * pages */ if (scanGetCandidate(scan, pos) == false || !ItemPointerEquals(&pos->item, &item)) elog(ERROR, "Could not process tuple"); /* XXX should not be * here ! */ } } return hasAllMatchingKeys(so, pos); } /* * Collect all matched rows from pending list in bitmap */ static void scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) { GinScanOpaque so = (GinScanOpaque) scan->opaque; MemoryContext oldCtx; bool recheck, match; int i; pendingPosition pos; Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO); BlockNumber blkno; *ntids = 0; LockBuffer(metabuffer, GIN_SHARE); blkno = GinPageGetMeta(BufferGetPage(metabuffer))->head; /* * fetch head of list before unlocking metapage. head page must be pinned * to prevent deletion by vacuum process */ if (blkno == InvalidBlockNumber) { /* No pending list, so proceed with normal scan */ UnlockReleaseBuffer(metabuffer); return; } pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno); LockBuffer(pos.pendingBuffer, GIN_SHARE); pos.firstOffset = FirstOffsetNumber; UnlockReleaseBuffer(metabuffer); pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys); /* * loop for each heap row. scanGetCandidate returns full row or row's * tuples from first page. */ while (scanGetCandidate(scan, &pos)) { /* * Check entries in tuple and setup entryRes array If tuples of heap's * row are placed on several pages collectDatumForItem will read all * of that pages. */ if (!collectDatumForItem(scan, &pos)) continue; /* * Matching of entries of one row is finished, so check row using * consistent functions. */ oldCtx = MemoryContextSwitchTo(so->tempCtx); recheck = false; match = true; for (i = 0; i < so->nkeys; i++) { GinScanKey key = so->keys + i; if (!callConsistentFn(&so->ginstate, key)) { match = false; break; } recheck |= key->recheckCurItem; } MemoryContextSwitchTo(oldCtx); MemoryContextReset(so->tempCtx); if (match) { tbm_add_tuples(tbm, &pos.item, 1, recheck); (*ntids)++; } } pfree(pos.hasMatchKey); } /* * Get next heap item pointer (after advancePast) from scan. * Returns true if anything found. * On success, *item and *recheck are set. * * Note: this is very nearly the same logic as in keyGetItem(), except * that we know the keys are to be combined with AND logic, whereas in * keyGetItem() the combination logic is known only to the consistentFn. */ static bool scanGetItem(IndexScanDesc scan, ItemPointer advancePast, ItemPointerData *item, bool *recheck) { GinScanOpaque so = (GinScanOpaque) scan->opaque; ItemPointerData myAdvancePast = *advancePast; uint32 i; bool match; for (;;) { /* * Advance any keys that are <= myAdvancePast. In particular, * since key->curItem was initialized with ItemPointerSetMin, this * ensures we fetch the first item for each key on the first call. * Then set *item to the minimum of the key curItems. * * Note: a lossy-page entry is encoded by a ItemPointer with max value * for offset (0xffff), so that it will sort after any exact entries * for the same page. 
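		 * (For instance, the lossy pointer for block 42 is (42,0xffff),
		 * which compares after exact pointers such as (42,6) or (42,7).)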
So we'll prefer to return exact pointers not * lossy pointers, which is good. Also, when we advance past an exact * entry after processing it, we will not advance past lossy entries * for the same page in other keys, which is NECESSARY for correct * results (since we might have additional entries for the same page * in the first key). */ ItemPointerSetMax(item); for (i = 0; i < so->nkeys; i++) { GinScanKey key = so->keys + i; while (key->isFinished == FALSE && compareItemPointers(&key->curItem, &myAdvancePast) <= 0) keyGetItem(scan->indexRelation, &so->ginstate, so->tempCtx, key, &myAdvancePast); if (key->isFinished) return FALSE; /* finished one of keys */ if (compareItemPointers(&key->curItem, item) < 0) *item = key->curItem; } Assert(!ItemPointerIsMax(item)); /*---------- * Now *item contains first ItemPointer after previous result. * * The item is a valid hit only if all the keys returned either * that exact TID, or a lossy reference to the same page. * * This logic works only if a keyGetItem stream can never contain both * exact and lossy pointers for the same page. Else we could have a * case like * * stream 1 stream 2 * ... ... * 42/6 42/7 * 50/1 42/0xffff * ... ... * * We would conclude that 42/6 is not a match and advance stream 1, * thus never detecting the match to the lossy pointer in stream 2. * (keyGetItem has a similar problem versus entryGetItem.) *---------- */ match = true; for (i = 0; i < so->nkeys; i++) { GinScanKey key = so->keys + i; if (compareItemPointers(item, &key->curItem) == 0) continue; if (ItemPointerIsLossyPage(&key->curItem) && GinItemPointerGetBlockNumber(&key->curItem) == GinItemPointerGetBlockNumber(item)) continue; match = false; break; } if (match) break; /* * No hit. Update myAdvancePast to this TID, so that on the next * pass we'll move to the next possible entry. */ myAdvancePast = *item; } /* * We must return recheck = true if any of the keys are marked recheck. */ *recheck = false; for (i = 0; i < so->nkeys; i++) { GinScanKey key = so->keys + i; if (key->recheckCurItem) { *recheck = true; break; } } return TRUE; } #define GinIsNewKey(s) ( ((GinScanOpaque) scan->opaque)->keys == NULL ) #define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes == true ) Datum gingetbitmap(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); int64 ntids; ItemPointerData iptr; bool recheck; if (GinIsNewKey(scan)) newScanKey(scan); if (GinIsVoidRes(scan)) PG_RETURN_INT64(0); ntids = 0; /* * First, scan the pending list and collect any matching entries into the * bitmap. After we scan a pending item, some other backend could post it * into the main index, and so we might visit it a second time during the * main scan. This is okay because we'll just re-set the same bit in the * bitmap. (The possibility of duplicate visits is a major reason why GIN * can't support the amgettuple API, however.) Note that it would not do * to scan the main index before the pending list, since concurrent * cleanup could then make us miss entries entirely. */ scanPendingInsert(scan, tbm, &ntids); /* * Now scan the main index. */ startScan(scan); ItemPointerSetMin(&iptr); for (;;) { CHECK_FOR_INTERRUPTS(); if (!scanGetItem(scan, &iptr, &iptr, &recheck)) break; if (ItemPointerIsLossyPage(&iptr)) tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr)); else tbm_add_tuples(tbm, &iptr, 1, recheck); ntids++; } PG_RETURN_INT64(ntids); }