mirror of
https://github.com/postgres/postgres.git
synced 2025-05-03 22:24:49 +03:00
Reported-by: Michael Paquier Discussion: https://postgr.es/m/ZZKTDPxBBMt3C0J9@paquier.xyz Backpatch-through: 12
1968 lines
54 KiB
C
1968 lines
54 KiB
C
/*-------------------------------------------------------------------------
 *
 * ginget.c
 *	  fetch tuples from a GIN scan.
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/gin/ginget.c
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "access/gin_private.h"
|
|
#include "access/relscan.h"
|
|
#include "common/pg_prng.h"
|
|
#include "miscadmin.h"
|
|
#include "storage/predicate.h"
|
|
#include "utils/datum.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/rel.h"
|
|
|
|
/* GUC parameter */
|
|
int GinFuzzySearchLimit = 0;
|
|
|
|
typedef struct pendingPosition
|
|
{
|
|
Buffer pendingBuffer;
|
|
OffsetNumber firstOffset;
|
|
OffsetNumber lastOffset;
|
|
ItemPointerData item;
|
|
bool *hasMatchKey;
|
|
} pendingPosition;
|
|
|
|
|
|
/*
|
|
* Goes to the next page if current offset is outside of bounds
|
|
*/
|
|
static bool
|
|
moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot)
|
|
{
|
|
Page page = BufferGetPage(stack->buffer);
|
|
|
|
if (stack->off > PageGetMaxOffsetNumber(page))
|
|
{
|
|
/*
|
|
* We scanned the whole page, so we should take right page
|
|
*/
|
|
if (GinPageRightMost(page))
|
|
return false; /* no more pages */
|
|
|
|
stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
|
|
stack->blkno = BufferGetBlockNumber(stack->buffer);
|
|
stack->off = FirstOffsetNumber;
|
|
PredicateLockPage(btree->index, stack->blkno, snapshot);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* Scan all pages of a posting tree and save all its heap ItemPointers
|
|
* in scanEntry->matchBitmap
|
|
*/
|
|
static void
|
|
scanPostingTree(Relation index, GinScanEntry scanEntry,
|
|
BlockNumber rootPostingTree)
|
|
{
|
|
GinBtreeData btree;
|
|
GinBtreeStack *stack;
|
|
Buffer buffer;
|
|
Page page;
|
|
|
|
/* Descend to the leftmost leaf page */
|
|
stack = ginScanBeginPostingTree(&btree, index, rootPostingTree);
|
|
buffer = stack->buffer;
|
|
|
|
IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
|
|
|
|
freeGinBtreeStack(stack);
|
|
|
|
/*
|
|
* Loop iterates through all leaf pages of posting tree
|
|
*/
|
|
for (;;)
|
|
{
|
|
page = BufferGetPage(buffer);
|
|
if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
|
|
{
|
|
int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
|
|
|
|
scanEntry->predictNumberResult += n;
|
|
}
|
|
|
|
if (GinPageRightMost(page))
|
|
break; /* no more pages */
|
|
|
|
buffer = ginStepRight(buffer, index, GIN_SHARE);
|
|
}
|
|
|
|
UnlockReleaseBuffer(buffer);
|
|
}
|
|
|
|
/*
|
|
* Collects TIDs into scanEntry->matchBitmap for all heap tuples that
|
|
* match the search entry. This supports three different match modes:
|
|
*
|
|
* 1. Partial-match support: scan from current point until the
|
|
* comparePartialFn says we're done.
|
|
* 2. SEARCH_MODE_ALL: scan from current point (which should be first
|
|
* key for the current attnum) until we hit null items or end of attnum
|
|
* 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first
|
|
* key for the current attnum) until we hit end of attnum
|
|
*
|
|
* Returns true if done, false if it's necessary to restart scan from scratch
|
|
*/
|
|
static bool
|
|
collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
|
|
GinScanEntry scanEntry, Snapshot snapshot)
|
|
{
|
|
OffsetNumber attnum;
|
|
Form_pg_attribute attr;
|
|
|
|
/* Initialize empty bitmap result */
|
|
scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);
|
|
|
|
/* Null query cannot partial-match anything */
|
|
if (scanEntry->isPartialMatch &&
|
|
scanEntry->queryCategory != GIN_CAT_NORM_KEY)
|
|
return true;
|
|
|
|
/* Locate tupdesc entry for key column (for attbyval/attlen data) */
|
|
attnum = scanEntry->attnum;
|
|
attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1);
|
|
|
|
/*
|
|
* Predicate lock entry leaf page, following pages will be locked by
|
|
* moveRightIfItNeeded()
|
|
*/
|
|
PredicateLockPage(btree->index,
|
|
BufferGetBlockNumber(stack->buffer),
|
|
snapshot);
|
|
|
|
for (;;)
|
|
{
|
|
Page page;
|
|
IndexTuple itup;
|
|
Datum idatum;
|
|
GinNullCategory icategory;
|
|
|
|
/*
|
|
* stack->off points to the interested entry, buffer is already locked
|
|
*/
|
|
if (moveRightIfItNeeded(btree, stack, snapshot) == false)
|
|
return true;
|
|
|
|
page = BufferGetPage(stack->buffer);
|
|
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
|
|
|
|
/*
|
|
* If tuple stores another attribute then stop scan
|
|
*/
|
|
if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
|
|
return true;
|
|
|
|
/* Safe to fetch attribute value */
|
|
idatum = gintuple_get_key(btree->ginstate, itup, &icategory);
|
|
|
|
/*
|
|
* Check for appropriate scan stop conditions
|
|
*/
|
|
if (scanEntry->isPartialMatch)
|
|
{
|
|
int32 cmp;
|
|
|
|
/*
|
|
* In partial match, stop scan at any null (including
|
|
* placeholders); partial matches never match nulls
|
|
*/
|
|
if (icategory != GIN_CAT_NORM_KEY)
|
|
return true;
|
|
|
|
/*----------
|
|
* Check of partial match.
|
|
* case cmp == 0 => match
|
|
* case cmp > 0 => not match and finish scan
|
|
* case cmp < 0 => not match and continue scan
|
|
*----------
|
|
*/
|
|
cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1],
|
|
btree->ginstate->supportCollation[attnum - 1],
|
|
scanEntry->queryKey,
|
|
idatum,
|
|
UInt16GetDatum(scanEntry->strategy),
|
|
PointerGetDatum(scanEntry->extra_data)));
|
|
|
|
if (cmp > 0)
|
|
return true;
|
|
else if (cmp < 0)
|
|
{
|
|
stack->off++;
|
|
continue;
|
|
}
|
|
}
|
|
else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL)
|
|
{
|
|
/*
|
|
* In ALL mode, we are not interested in null items, so we can
|
|
* stop if we get to a null-item placeholder (which will be the
|
|
* last entry for a given attnum). We do want to include NULL_KEY
|
|
* and EMPTY_ITEM entries, though.
|
|
*/
|
|
if (icategory == GIN_CAT_NULL_ITEM)
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* OK, we want to return the TIDs listed in this entry.
|
|
*/
|
|
if (GinIsPostingTree(itup))
|
|
{
|
|
BlockNumber rootPostingTree = GinGetPostingTree(itup);
|
|
|
|
/*
|
|
* We should unlock current page (but not unpin) during tree scan
|
|
* to prevent deadlock with vacuum processes.
|
|
*
|
|
* We save current entry value (idatum) to be able to re-find our
|
|
* tuple after re-locking
|
|
*/
|
|
if (icategory == GIN_CAT_NORM_KEY)
|
|
idatum = datumCopy(idatum, attr->attbyval, attr->attlen);
|
|
|
|
LockBuffer(stack->buffer, GIN_UNLOCK);
|
|
|
|
/*
|
|
* Acquire predicate lock on the posting tree. We already hold a
|
|
* lock on the entry page, but insertions to the posting tree
|
|
* don't check for conflicts on that level.
|
|
*/
|
|
PredicateLockPage(btree->index, rootPostingTree, snapshot);
|
|
|
|
/* Collect all the TIDs in this entry's posting tree */
|
|
scanPostingTree(btree->index, scanEntry, rootPostingTree);
|
|
|
|
/*
|
|
* We lock again the entry page and while it was unlocked insert
|
|
* might have occurred, so we need to re-find our position.
|
|
*/
|
|
LockBuffer(stack->buffer, GIN_SHARE);
|
|
page = BufferGetPage(stack->buffer);
|
|
if (!GinPageIsLeaf(page))
|
|
{
|
|
/*
|
|
* Root page becomes non-leaf while we unlock it. We will
|
|
* start again, this situation doesn't occur often - root can
|
|
* became a non-leaf only once per life of index.
|
|
*/
|
|
return false;
|
|
}
|
|
|
|
/* Search forward to re-find idatum */
|
|
for (;;)
|
|
{
|
|
if (moveRightIfItNeeded(btree, stack, snapshot) == false)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INTERNAL_ERROR),
|
|
errmsg("failed to re-find tuple within index \"%s\"",
|
|
RelationGetRelationName(btree->index))));
|
|
|
|
page = BufferGetPage(stack->buffer);
|
|
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
|
|
|
|
if (gintuple_get_attrnum(btree->ginstate, itup) == attnum)
|
|
{
|
|
Datum newDatum;
|
|
GinNullCategory newCategory;
|
|
|
|
newDatum = gintuple_get_key(btree->ginstate, itup,
|
|
&newCategory);
|
|
|
|
if (ginCompareEntries(btree->ginstate, attnum,
|
|
newDatum, newCategory,
|
|
idatum, icategory) == 0)
|
|
break; /* Found! */
|
|
}
|
|
|
|
stack->off++;
|
|
}
|
|
|
|
if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
|
|
pfree(DatumGetPointer(idatum));
|
|
}
|
|
else
|
|
{
|
|
ItemPointer ipd;
|
|
int nipd;
|
|
|
|
ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
|
|
tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
|
|
scanEntry->predictNumberResult += GinGetNPosting(itup);
|
|
pfree(ipd);
|
|
}
|
|
|
|
/*
|
|
* Done with this entry, go to the next
|
|
*/
|
|
stack->off++;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Start* functions setup beginning state of searches: finds correct buffer and pins it.
|
|
*/
|
|
static void
|
|
startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
|
|
{
|
|
GinBtreeData btreeEntry;
|
|
GinBtreeStack *stackEntry;
|
|
Page page;
|
|
bool needUnlock;
|
|
|
|
restartScanEntry:
|
|
entry->buffer = InvalidBuffer;
|
|
ItemPointerSetMin(&entry->curItem);
|
|
entry->offset = InvalidOffsetNumber;
|
|
if (entry->list)
|
|
pfree(entry->list);
|
|
entry->list = NULL;
|
|
entry->nlist = 0;
|
|
entry->matchBitmap = NULL;
|
|
entry->matchResult = NULL;
|
|
entry->reduceResult = false;
|
|
entry->predictNumberResult = 0;
|
|
|
|
/*
|
|
* we should find entry, and begin scan of posting tree or just store
|
|
* posting list in memory
|
|
*/
|
|
ginPrepareEntryScan(&btreeEntry, entry->attnum,
|
|
entry->queryKey, entry->queryCategory,
|
|
ginstate);
|
|
stackEntry = ginFindLeafPage(&btreeEntry, true, false);
|
|
page = BufferGetPage(stackEntry->buffer);
|
|
|
|
/* ginFindLeafPage() will have already checked snapshot age. */
|
|
needUnlock = true;
|
|
|
|
entry->isFinished = true;
|
|
|
|
if (entry->isPartialMatch ||
|
|
entry->queryCategory == GIN_CAT_EMPTY_QUERY)
|
|
{
|
|
/*
|
|
* btreeEntry.findItem locates the first item >= given search key.
|
|
* (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
|
|
* because of the way the GIN_CAT_EMPTY_QUERY category code is
|
|
* assigned.) We scan forward from there and collect all TIDs needed
|
|
* for the entry type.
|
|
*/
|
|
btreeEntry.findItem(&btreeEntry, stackEntry);
|
|
if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot)
|
|
== false)
|
|
{
|
|
/*
|
|
* GIN tree was seriously restructured, so we will cleanup all
|
|
* found data and rescan. See comments near 'return false' in
|
|
* collectMatchBitmap()
|
|
*/
|
|
if (entry->matchBitmap)
|
|
{
|
|
if (entry->matchIterator)
|
|
tbm_end_iterate(entry->matchIterator);
|
|
entry->matchIterator = NULL;
|
|
tbm_free(entry->matchBitmap);
|
|
entry->matchBitmap = NULL;
|
|
}
|
|
LockBuffer(stackEntry->buffer, GIN_UNLOCK);
|
|
freeGinBtreeStack(stackEntry);
|
|
goto restartScanEntry;
|
|
}
|
|
|
|
if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
|
|
{
|
|
entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
|
|
entry->isFinished = false;
|
|
}
|
|
}
|
|
else if (btreeEntry.findItem(&btreeEntry, stackEntry))
|
|
{
|
|
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
|
|
|
|
if (GinIsPostingTree(itup))
|
|
{
|
|
BlockNumber rootPostingTree = GinGetPostingTree(itup);
|
|
GinBtreeStack *stack;
|
|
Page entrypage;
|
|
ItemPointerData minItem;
|
|
|
|
/*
|
|
* This is an equality scan, so lock the root of the posting tree.
|
|
* It represents a lock on the exact key value, and covers all the
|
|
* items in the posting tree.
|
|
*/
|
|
PredicateLockPage(ginstate->index, rootPostingTree, snapshot);
|
|
|
|
/*
|
|
* We should unlock entry page before touching posting tree to
|
|
* prevent deadlocks with vacuum processes. Because entry is never
|
|
* deleted from page and posting tree is never reduced to the
|
|
* posting list, we can unlock page after getting BlockNumber of
|
|
* root of posting tree.
|
|
*/
|
|
LockBuffer(stackEntry->buffer, GIN_UNLOCK);
|
|
needUnlock = false;
|
|
|
|
stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
|
|
rootPostingTree);
|
|
entry->buffer = stack->buffer;
|
|
|
|
/*
|
|
* We keep buffer pinned because we need to prevent deletion of
|
|
* page during scan. See GIN's vacuum implementation. RefCount is
|
|
* increased to keep buffer pinned after freeGinBtreeStack() call.
|
|
*/
|
|
IncrBufferRefCount(entry->buffer);
|
|
|
|
entrypage = BufferGetPage(entry->buffer);
|
|
|
|
/*
|
|
* Load the first page into memory.
|
|
*/
|
|
ItemPointerSetMin(&minItem);
|
|
entry->list = GinDataLeafPageGetItems(entrypage, &entry->nlist, minItem);
|
|
|
|
entry->predictNumberResult = stack->predictNumber * entry->nlist;
|
|
|
|
LockBuffer(entry->buffer, GIN_UNLOCK);
|
|
freeGinBtreeStack(stack);
|
|
entry->isFinished = false;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* Lock the entry leaf page. This is more coarse-grained than
|
|
* necessary, because it will conflict with any insertions that
|
|
* land on the same leaf page, not only the exact key we searched
|
|
* for. But locking an individual tuple would require updating
|
|
* that lock whenever it moves because of insertions or vacuums,
|
|
* which seems too complicated.
|
|
*/
|
|
PredicateLockPage(ginstate->index,
|
|
BufferGetBlockNumber(stackEntry->buffer),
|
|
snapshot);
|
|
if (GinGetNPosting(itup) > 0)
|
|
{
|
|
entry->list = ginReadTuple(ginstate, entry->attnum, itup,
|
|
&entry->nlist);
|
|
entry->predictNumberResult = entry->nlist;
|
|
|
|
entry->isFinished = false;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* No entry found. Predicate lock the leaf page, to lock the place
|
|
* where the entry would've been, had there been one.
|
|
*/
|
|
PredicateLockPage(ginstate->index,
|
|
BufferGetBlockNumber(stackEntry->buffer), snapshot);
|
|
}
|
|
|
|
if (needUnlock)
|
|
LockBuffer(stackEntry->buffer, GIN_UNLOCK);
|
|
freeGinBtreeStack(stackEntry);
|
|
}
|
|
|
|
/*
|
|
* Comparison function for scan entry indexes. Sorts by predictNumberResult,
|
|
* least frequent items first.
|
|
*/
|
|
static int
|
|
entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
|
|
{
|
|
const GinScanKey key = (const GinScanKey) arg;
|
|
int i1 = *(const int *) a1;
|
|
int i2 = *(const int *) a2;
|
|
uint32 n1 = key->scanEntry[i1]->predictNumberResult;
|
|
uint32 n2 = key->scanEntry[i2]->predictNumberResult;
|
|
|
|
if (n1 < n2)
|
|
return -1;
|
|
else if (n1 == n2)
|
|
return 0;
|
|
else
|
|
return 1;
|
|
}
|
|
|
|
static void
|
|
startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
|
|
{
|
|
MemoryContext oldCtx = CurrentMemoryContext;
|
|
int i;
|
|
int j;
|
|
int *entryIndexes;
|
|
|
|
ItemPointerSetMin(&key->curItem);
|
|
key->curItemMatches = false;
|
|
key->recheckCurItem = false;
|
|
key->isFinished = false;
|
|
|
|
/*
|
|
* Divide the entries into two distinct sets: required and additional.
|
|
* Additional entries are not enough for a match alone, without any items
|
|
* from the required set, but are needed by the consistent function to
|
|
* decide if an item matches. When scanning, we can skip over items from
|
|
* additional entries that have no corresponding matches in any of the
|
|
* required entries. That speeds up queries like "frequent & rare"
|
|
* considerably, if the frequent term can be put in the additional set.
|
|
*
|
|
* There can be many legal ways to divide them entries into these two
|
|
* sets. A conservative division is to just put everything in the required
|
|
* set, but the more you can put in the additional set, the more you can
|
|
* skip during the scan. To maximize skipping, we try to put as many
|
|
* frequent items as possible into additional, and less frequent ones into
|
|
* required. To do that, sort the entries by frequency
|
|
* (predictNumberResult), and put entries into the required set in that
|
|
* order, until the consistent function says that none of the remaining
|
|
* entries can form a match, without any items from the required set. The
|
|
* rest go to the additional set.
|
|
*
|
|
* Exclude-only scan keys are known to have no required entries.
|
|
*/
|
|
if (key->excludeOnly)
|
|
{
|
|
MemoryContextSwitchTo(so->keyCtx);
|
|
|
|
key->nrequired = 0;
|
|
key->nadditional = key->nentries;
|
|
key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
|
|
for (i = 0; i < key->nadditional; i++)
|
|
key->additionalEntries[i] = key->scanEntry[i];
|
|
}
|
|
else if (key->nentries > 1)
|
|
{
|
|
MemoryContextSwitchTo(so->tempCtx);
|
|
|
|
entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
|
|
for (i = 0; i < key->nentries; i++)
|
|
entryIndexes[i] = i;
|
|
qsort_arg(entryIndexes, key->nentries, sizeof(int),
|
|
entryIndexByFrequencyCmp, key);
|
|
|
|
for (i = 0; i < key->nentries - 1; i++)
|
|
{
|
|
/* Pass all entries <= i as FALSE, and the rest as MAYBE */
|
|
for (j = 0; j <= i; j++)
|
|
key->entryRes[entryIndexes[j]] = GIN_FALSE;
|
|
for (j = i + 1; j < key->nentries; j++)
|
|
key->entryRes[entryIndexes[j]] = GIN_MAYBE;
|
|
|
|
if (key->triConsistentFn(key) == GIN_FALSE)
|
|
break;
|
|
}
|
|
/* i is now the last required entry. */
|
|
|
|
MemoryContextSwitchTo(so->keyCtx);
|
|
|
|
key->nrequired = i + 1;
|
|
key->nadditional = key->nentries - key->nrequired;
|
|
key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
|
|
key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
|
|
|
|
j = 0;
|
|
for (i = 0; i < key->nrequired; i++)
|
|
key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
|
|
for (i = 0; i < key->nadditional; i++)
|
|
key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
|
|
|
|
/* clean up after consistentFn calls (also frees entryIndexes) */
|
|
MemoryContextReset(so->tempCtx);
|
|
}
|
|
else
|
|
{
|
|
MemoryContextSwitchTo(so->keyCtx);
|
|
|
|
key->nrequired = 1;
|
|
key->nadditional = 0;
|
|
key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
|
|
key->requiredEntries[0] = key->scanEntry[0];
|
|
}
|
|
MemoryContextSwitchTo(oldCtx);
|
|
}
|
|
|
|
static void
|
|
startScan(IndexScanDesc scan)
|
|
{
|
|
GinScanOpaque so = (GinScanOpaque) scan->opaque;
|
|
GinState *ginstate = &so->ginstate;
|
|
uint32 i;
|
|
|
|
for (i = 0; i < so->totalentries; i++)
|
|
startScanEntry(ginstate, so->entries[i], scan->xs_snapshot);
|
|
|
|
if (GinFuzzySearchLimit > 0)
|
|
{
|
|
/*
|
|
* If all of keys more than threshold we will try to reduce result, we
|
|
* hope (and only hope, for intersection operation of array our
|
|
* supposition isn't true), that total result will not more than
|
|
* minimal predictNumberResult.
|
|
*/
|
|
bool reduce = true;
|
|
|
|
for (i = 0; i < so->totalentries; i++)
|
|
{
|
|
if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
|
|
{
|
|
reduce = false;
|
|
break;
|
|
}
|
|
}
|
|
if (reduce)
|
|
{
|
|
for (i = 0; i < so->totalentries; i++)
|
|
{
|
|
so->entries[i]->predictNumberResult /= so->totalentries;
|
|
so->entries[i]->reduceResult = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Now that we have the estimates for the entry frequencies, finish
|
|
* initializing the scan keys.
|
|
*/
|
|
for (i = 0; i < so->nkeys; i++)
|
|
startScanKey(ginstate, so, so->keys + i);
|
|
}
|
|
|
|
/*
|
|
* Load the next batch of item pointers from a posting tree.
|
|
*
|
|
* Note that we copy the page into GinScanEntry->list array and unlock it, but
|
|
* keep it pinned to prevent interference with vacuum.
|
|
*/
|
|
static void
|
|
entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
|
|
ItemPointerData advancePast)
|
|
{
|
|
Page page;
|
|
int i;
|
|
bool stepright;
|
|
|
|
if (!BufferIsValid(entry->buffer))
|
|
{
|
|
entry->isFinished = true;
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* We have two strategies for finding the correct page: step right from
|
|
* the current page, or descend the tree again from the root. If
|
|
* advancePast equals the current item, the next matching item should be
|
|
* on the next page, so we step right. Otherwise, descend from root.
|
|
*/
|
|
if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
|
|
{
|
|
stepright = true;
|
|
LockBuffer(entry->buffer, GIN_SHARE);
|
|
}
|
|
else
|
|
{
|
|
GinBtreeStack *stack;
|
|
|
|
ReleaseBuffer(entry->buffer);
|
|
|
|
/*
|
|
* Set the search key, and find the correct leaf page.
|
|
*/
|
|
if (ItemPointerIsLossyPage(&advancePast))
|
|
{
|
|
ItemPointerSet(&entry->btree.itemptr,
|
|
GinItemPointerGetBlockNumber(&advancePast) + 1,
|
|
FirstOffsetNumber);
|
|
}
|
|
else
|
|
{
|
|
ItemPointerSet(&entry->btree.itemptr,
|
|
GinItemPointerGetBlockNumber(&advancePast),
|
|
OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
|
|
}
|
|
entry->btree.fullScan = false;
|
|
stack = ginFindLeafPage(&entry->btree, true, false);
|
|
|
|
/* we don't need the stack, just the buffer. */
|
|
entry->buffer = stack->buffer;
|
|
IncrBufferRefCount(entry->buffer);
|
|
freeGinBtreeStack(stack);
|
|
stepright = false;
|
|
}
|
|
|
|
elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
|
|
GinItemPointerGetBlockNumber(&advancePast),
|
|
GinItemPointerGetOffsetNumber(&advancePast),
|
|
!stepright);
|
|
|
|
page = BufferGetPage(entry->buffer);
|
|
for (;;)
|
|
{
|
|
entry->offset = InvalidOffsetNumber;
|
|
if (entry->list)
|
|
{
|
|
pfree(entry->list);
|
|
entry->list = NULL;
|
|
entry->nlist = 0;
|
|
}
|
|
|
|
if (stepright)
|
|
{
|
|
/*
|
|
* We've processed all the entries on this page. If it was the
|
|
* last page in the tree, we're done.
|
|
*/
|
|
if (GinPageRightMost(page))
|
|
{
|
|
UnlockReleaseBuffer(entry->buffer);
|
|
entry->buffer = InvalidBuffer;
|
|
entry->isFinished = true;
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Step to next page, following the right link. then find the
|
|
* first ItemPointer greater than advancePast.
|
|
*/
|
|
entry->buffer = ginStepRight(entry->buffer,
|
|
ginstate->index,
|
|
GIN_SHARE);
|
|
page = BufferGetPage(entry->buffer);
|
|
}
|
|
stepright = true;
|
|
|
|
if (GinPageGetOpaque(page)->flags & GIN_DELETED)
|
|
continue; /* page was deleted by concurrent vacuum */
|
|
|
|
/*
|
|
* The first item > advancePast might not be on this page, but
|
|
* somewhere to the right, if the page was split, or a non-match from
|
|
* another key in the query allowed us to skip some items from this
|
|
* entry. Keep following the right-links until we re-find the correct
|
|
* page.
|
|
*/
|
|
if (!GinPageRightMost(page) &&
|
|
ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
|
|
{
|
|
/*
|
|
* the item we're looking is > the right bound of the page, so it
|
|
* can't be on this page.
|
|
*/
|
|
continue;
|
|
}
|
|
|
|
entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);
|
|
|
|
for (i = 0; i < entry->nlist; i++)
|
|
{
|
|
if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
|
|
{
|
|
entry->offset = i;
|
|
|
|
if (GinPageRightMost(page))
|
|
{
|
|
/* after processing the copied items, we're done. */
|
|
UnlockReleaseBuffer(entry->buffer);
|
|
entry->buffer = InvalidBuffer;
|
|
}
|
|
else
|
|
LockBuffer(entry->buffer, GIN_UNLOCK);
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#define gin_rand() pg_prng_double(&pg_global_prng_state)
|
|
#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
|
|
|
|
/*
|
|
* Sets entry->curItem to next heap item pointer > advancePast, for one entry
|
|
* of one scan key, or sets entry->isFinished to true if there are no more.
|
|
*
|
|
* Item pointers are returned in ascending order.
|
|
*
|
|
* Note: this can return a "lossy page" item pointer, indicating that the
|
|
* entry potentially matches all items on that heap page. However, it is
|
|
* not allowed to return both a lossy page pointer and exact (regular)
|
|
* item pointers for the same page. (Doing so would break the key-combination
|
|
* logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the
|
|
* current implementation this is guaranteed by the behavior of tidbitmaps.
|
|
*/
|
|
static void
|
|
entryGetItem(GinState *ginstate, GinScanEntry entry,
|
|
ItemPointerData advancePast)
|
|
{
|
|
Assert(!entry->isFinished);
|
|
|
|
Assert(!ItemPointerIsValid(&entry->curItem) ||
|
|
ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
|
|
|
|
if (entry->matchBitmap)
|
|
{
|
|
/* A bitmap result */
|
|
BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
|
|
OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
|
|
|
|
for (;;)
|
|
{
|
|
/*
|
|
* If we've exhausted all items on this block, move to next block
|
|
* in the bitmap.
|
|
*/
|
|
while (entry->matchResult == NULL ||
|
|
(entry->matchResult->ntuples >= 0 &&
|
|
entry->offset >= entry->matchResult->ntuples) ||
|
|
entry->matchResult->blockno < advancePastBlk ||
|
|
(ItemPointerIsLossyPage(&advancePast) &&
|
|
entry->matchResult->blockno == advancePastBlk))
|
|
{
|
|
entry->matchResult = tbm_iterate(entry->matchIterator);
|
|
|
|
if (entry->matchResult == NULL)
|
|
{
|
|
ItemPointerSetInvalid(&entry->curItem);
|
|
tbm_end_iterate(entry->matchIterator);
|
|
entry->matchIterator = NULL;
|
|
entry->isFinished = true;
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* Reset counter to the beginning of entry->matchResult. Note:
|
|
* entry->offset is still greater than matchResult->ntuples if
|
|
* matchResult is lossy. So, on next call we will get next
|
|
* result from TIDBitmap.
|
|
*/
|
|
entry->offset = 0;
|
|
}
|
|
if (entry->isFinished)
|
|
break;
|
|
|
|
/*
|
|
* We're now on the first page after advancePast which has any
|
|
* items on it. If it's a lossy result, return that.
|
|
*/
|
|
if (entry->matchResult->ntuples < 0)
|
|
{
|
|
ItemPointerSetLossyPage(&entry->curItem,
|
|
entry->matchResult->blockno);
|
|
|
|
/*
|
|
* We might as well fall out of the loop; we could not
|
|
* estimate number of results on this page to support correct
|
|
* reducing of result even if it's enabled.
|
|
*/
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* Not a lossy page. Skip over any offsets <= advancePast, and
|
|
* return that.
|
|
*/
|
|
if (entry->matchResult->blockno == advancePastBlk)
|
|
{
|
|
/*
|
|
* First, do a quick check against the last offset on the
|
|
* page. If that's > advancePast, so are all the other
|
|
* offsets, so just go back to the top to get the next page.
|
|
*/
|
|
if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
|
|
{
|
|
entry->offset = entry->matchResult->ntuples;
|
|
continue;
|
|
}
|
|
|
|
/* Otherwise scan to find the first item > advancePast */
|
|
while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
|
|
entry->offset++;
|
|
}
|
|
|
|
ItemPointerSet(&entry->curItem,
|
|
entry->matchResult->blockno,
|
|
entry->matchResult->offsets[entry->offset]);
|
|
entry->offset++;
|
|
|
|
/* Done unless we need to reduce the result */
|
|
if (!entry->reduceResult || !dropItem(entry))
|
|
break;
|
|
}
|
|
}
|
|
else if (!BufferIsValid(entry->buffer))
|
|
{
|
|
/*
|
|
* A posting list from an entry tuple, or the last page of a posting
|
|
* tree.
|
|
*/
|
|
for (;;)
|
|
{
|
|
if (entry->offset >= entry->nlist)
|
|
{
|
|
ItemPointerSetInvalid(&entry->curItem);
|
|
entry->isFinished = true;
|
|
break;
|
|
}
|
|
|
|
entry->curItem = entry->list[entry->offset++];
|
|
|
|
/* If we're not past advancePast, keep scanning */
|
|
if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
|
|
continue;
|
|
|
|
/* Done unless we need to reduce the result */
|
|
if (!entry->reduceResult || !dropItem(entry))
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* A posting tree */
|
|
for (;;)
|
|
{
|
|
/* If we've processed the current batch, load more items */
|
|
while (entry->offset >= entry->nlist)
|
|
{
|
|
entryLoadMoreItems(ginstate, entry, advancePast);
|
|
|
|
if (entry->isFinished)
|
|
{
|
|
ItemPointerSetInvalid(&entry->curItem);
|
|
return;
|
|
}
|
|
}
|
|
|
|
entry->curItem = entry->list[entry->offset++];
|
|
|
|
/* If we're not past advancePast, keep scanning */
|
|
if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
|
|
continue;
|
|
|
|
/* Done unless we need to reduce the result */
|
|
if (!entry->reduceResult || !dropItem(entry))
|
|
break;
|
|
|
|
/*
|
|
* Advance advancePast (so that entryLoadMoreItems will load the
|
|
* right data), and keep scanning
|
|
*/
|
|
advancePast = entry->curItem;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Identify the "current" item among the input entry streams for this scan key
 * that is greater than advancePast, and test whether it passes the scan key
 * qual condition.
 *
 * The current item is the smallest curItem among the inputs.  key->curItem
 * is set to that value.  key->curItemMatches is set to indicate whether that
 * TID passes the consistentFn test.  If so, key->recheckCurItem is set true
 * iff recheck is needed for this item pointer (including the case where the
 * item pointer is a lossy page pointer).
 *
 * If all entry streams are exhausted, sets key->isFinished to true.
 *
 * Item pointers must be returned in ascending order.
 *
 * Note: this can return a "lossy page" item pointer, indicating that the
 * key potentially matches all items on that heap page.  However, it is
 * not allowed to return both a lossy page pointer and exact (regular)
 * item pointers for the same page.  (Doing so would break the key-combination
 * logic in scanGetItem.)
 */
static void
keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
		   ItemPointerData advancePast)
{
	ItemPointerData minItem;
	ItemPointerData curPageLossy;
	uint32		i;
	bool		haveLossyEntry;
	GinScanEntry entry;
	GinTernaryValue res;
	MemoryContext oldCtx;
	bool		allFinished;

	Assert(!key->isFinished);

	/*
	 * We might have already tested this item; if so, no need to repeat work.
	 * (Note: the ">" case can happen, if advancePast is exact but we
	 * previously had to set curItem to a lossy-page pointer.)
	 */
	if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
		return;

	/*
	 * Find the minimum item > advancePast among the active entry streams.
	 *
	 * Note: a lossy-page entry is encoded by a ItemPointer with max value for
	 * offset (0xffff), so that it will sort after any exact entries for the
	 * same page.  So we'll prefer to return exact pointers not lossy
	 * pointers, which is good.
	 */
	ItemPointerSetMax(&minItem);
	allFinished = true;
	for (i = 0; i < key->nrequired; i++)
	{
		entry = key->requiredEntries[i];

		if (entry->isFinished)
			continue;

		/*
		 * Advance this stream if necessary.
		 *
		 * In particular, since entry->curItem was initialized with
		 * ItemPointerSetMin, this ensures we fetch the first item for each
		 * entry on the first call.
		 */
		if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
		{
			entryGetItem(ginstate, entry, advancePast);
			if (entry->isFinished)
				continue;
		}

		/* At least one required stream is still producing items */
		allFinished = false;
		if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
			minItem = entry->curItem;
	}

	if (allFinished && !key->excludeOnly)
	{
		/* all entries are finished */
		key->isFinished = true;
		return;
	}

	if (!key->excludeOnly)
	{
		/*
		 * For a normal scan key, we now know there are no matches < minItem.
		 *
		 * If minItem is lossy, it means that there were no exact items on the
		 * page among requiredEntries, because lossy pointers sort after exact
		 * items.  However, there might be exact items for the same page among
		 * additionalEntries, so we mustn't advance past them.
		 */
		if (ItemPointerIsLossyPage(&minItem))
		{
			if (GinItemPointerGetBlockNumber(&advancePast) <
				GinItemPointerGetBlockNumber(&minItem))
			{
				/* advance only up to the start of minItem's block */
				ItemPointerSet(&advancePast,
							   GinItemPointerGetBlockNumber(&minItem),
							   InvalidOffsetNumber);
			}
		}
		else
		{
			Assert(GinItemPointerGetOffsetNumber(&minItem) > 0);
			/* advance to just before minItem, so minItem itself is fetched */
			ItemPointerSet(&advancePast,
						   GinItemPointerGetBlockNumber(&minItem),
						   OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem)));
		}
	}
	else
	{
		/*
		 * excludeOnly scan keys don't have any entries that are necessarily
		 * present in matching items.  So, we consider the item just after
		 * advancePast.
		 */
		Assert(key->nrequired == 0);
		ItemPointerSet(&minItem,
					   GinItemPointerGetBlockNumber(&advancePast),
					   OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
	}

	/*
	 * We might not have loaded all the entry streams for this TID yet. We
	 * could call the consistent function, passing MAYBE for those entries, to
	 * see if it can decide if this TID matches based on the information we
	 * have.  But if the consistent-function is expensive, and cannot in fact
	 * decide with partial information, that could be a big loss. So, load all
	 * the additional entries, before calling the consistent function.
	 */
	for (i = 0; i < key->nadditional; i++)
	{
		entry = key->additionalEntries[i];

		if (entry->isFinished)
			continue;

		if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
		{
			entryGetItem(ginstate, entry, advancePast);
			if (entry->isFinished)
				continue;
		}

		/*
		 * Normally, none of the items in additionalEntries can have a curItem
		 * larger than minItem. But if minItem is a lossy page, then there
		 * might be exact items on the same page among additionalEntries.
		 */
		if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
		{
			Assert(ItemPointerIsLossyPage(&minItem));
			minItem = entry->curItem;
		}
	}

	/*
	 * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
	 * and perform consistentFn test.
	 *
	 * Lossy-page entries pose a problem, since we don't know the correct
	 * entryRes state to pass to the consistentFn, and we also don't know what
	 * its combining logic will be (could be AND, OR, or even NOT). If the
	 * logic is OR then the consistentFn might succeed for all items in the
	 * lossy page even when none of the other entries match.
	 *
	 * Our strategy is to call the tri-state consistent function, with the
	 * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
	 * returns FALSE, none of the lossy items alone are enough for a match, so
	 * we don't need to return a lossy-page pointer. Otherwise, return a
	 * lossy-page pointer to indicate that the whole heap page must be
	 * checked.  (On subsequent calls, we'll do nothing until minItem is past
	 * the page altogether, thus ensuring that we never return both regular
	 * and lossy pointers for the same page.)
	 *
	 * An exception is that it doesn't matter what we pass for lossy pointers
	 * in "hidden" entries, because the consistentFn's result can't depend on
	 * them. We could pass them as MAYBE as well, but if we're using the
	 * "shim" implementation of a tri-state consistent function (see
	 * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
	 * them as true.
	 *
	 * Note that only lossy-page entries pointing to the current item's page
	 * should trigger this processing; we might have future lossy pages in the
	 * entry array, but they aren't relevant yet.
	 */
	key->curItem = minItem;
	ItemPointerSetLossyPage(&curPageLossy,
							GinItemPointerGetBlockNumber(&key->curItem));
	haveLossyEntry = false;
	for (i = 0; i < key->nentries; i++)
	{
		entry = key->scanEntry[i];
		if (entry->isFinished == false &&
			ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
		{
			/* entries at index >= nuserentries are the "hidden" ones */
			if (i < key->nuserentries)
				key->entryRes[i] = GIN_MAYBE;
			else
				key->entryRes[i] = GIN_TRUE;
			haveLossyEntry = true;
		}
		else
			key->entryRes[i] = GIN_FALSE;
	}

	/* prepare for calling consistentFn in temp context */
	oldCtx = MemoryContextSwitchTo(tempCtx);

	if (haveLossyEntry)
	{
		/* Have lossy-page entries, so see if whole page matches */
		res = key->triConsistentFn(key);

		if (res == GIN_TRUE || res == GIN_MAYBE)
		{
			/* Yes, so clean up ... */
			MemoryContextSwitchTo(oldCtx);
			MemoryContextReset(tempCtx);

			/* and return lossy pointer for whole page */
			key->curItem = curPageLossy;
			key->curItemMatches = true;
			key->recheckCurItem = true;
			return;
		}
	}

	/*
	 * At this point we know that we don't need to return a lossy whole-page
	 * pointer, but we might have matches for individual exact item pointers,
	 * possibly in combination with a lossy pointer. Pass lossy pointers as
	 * MAYBE to the ternary consistent function, to let it decide if this
	 * tuple satisfies the overall key, even though we don't know if the lossy
	 * entries match.
	 *
	 * Prepare entryRes array to be passed to consistentFn.
	 */
	for (i = 0; i < key->nentries; i++)
	{
		entry = key->scanEntry[i];
		if (entry->isFinished)
			key->entryRes[i] = GIN_FALSE;
#if 0

		/*
		 * This case can't currently happen, because we loaded all the entries
		 * for this item earlier.
		 */
		else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
			key->entryRes[i] = GIN_MAYBE;
#endif
		else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
			key->entryRes[i] = GIN_MAYBE;
		else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
			key->entryRes[i] = GIN_TRUE;
		else
			key->entryRes[i] = GIN_FALSE;
	}

	res = key->triConsistentFn(key);

	switch (res)
	{
		case GIN_TRUE:
			key->curItemMatches = true;
			/* triConsistentFn set recheckCurItem */
			break;

		case GIN_FALSE:
			key->curItemMatches = false;
			break;

		case GIN_MAYBE:
			key->curItemMatches = true;
			key->recheckCurItem = true;
			break;

		default:

			/*
			 * the 'default' case shouldn't happen, but if the consistent
			 * function returns something bogus, this is the safe result
			 */
			key->curItemMatches = true;
			key->recheckCurItem = true;
			break;
	}

	/*
	 * We have a tuple, and we know if it matches or not. If it's a non-match,
	 * we could continue to find the next matching tuple, but let's break out
	 * and give scanGetItem a chance to advance the other keys. They might be
	 * able to skip past to a much higher TID, allowing us to save work.
	 */

	/* clean up after consistentFn calls */
	MemoryContextSwitchTo(oldCtx);
	MemoryContextReset(tempCtx);
}
|
|
|
|
/*
 * Get next heap item pointer (after advancePast) from scan.
 * Returns true if anything found.
 * On success, *item and *recheck are set.
 *
 * Note: this is very nearly the same logic as in keyGetItem(), except
 * that we know the keys are to be combined with AND logic, whereas in
 * keyGetItem() the combination logic is known only to the consistentFn.
 */
static bool
scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
			ItemPointerData *item, bool *recheck)
{
	GinScanOpaque so = (GinScanOpaque) scan->opaque;
	uint32		i;
	bool		match;

	/*----------
	 * Advance the scan keys in lock-step, until we find an item that matches
	 * all the keys. If any key reports isFinished, meaning its subset of the
	 * entries is exhausted, we can stop.  Otherwise, set *item to the next
	 * matching item.
	 *
	 * This logic works only if a keyGetItem stream can never contain both
	 * exact and lossy pointers for the same page.  Else we could have a
	 * case like
	 *
	 *		stream 1		stream 2
	 *		...				...
	 *		42/6			42/7
	 *		50/1			42/0xffff
	 *		...				...
	 *
	 * We would conclude that 42/6 is not a match and advance stream 1,
	 * thus never detecting the match to the lossy pointer in stream 2.
	 * (keyGetItem has a similar problem versus entryGetItem.)
	 *----------
	 */
	do
	{
		/* *item is "no candidate yet" until the first key sets it */
		ItemPointerSetMin(item);
		match = true;
		for (i = 0; i < so->nkeys && match; i++)
		{
			GinScanKey	key = so->keys + i;

			/*
			 * If we're considering a lossy page, skip excludeOnly keys.  They
			 * can't exclude the whole page anyway.
			 */
			if (ItemPointerIsLossyPage(item) && key->excludeOnly)
			{
				/*
				 * ginNewScanKey() should never mark the first key as
				 * excludeOnly.
				 */
				Assert(i > 0);
				continue;
			}

			/* Fetch the next item for this key that is > advancePast. */
			keyGetItem(&so->ginstate, so->tempCtx, key, advancePast);

			if (key->isFinished)
				return false;

			/*
			 * If it's not a match, we can immediately conclude that nothing
			 * <= this item matches, without checking the rest of the keys.
			 */
			if (!key->curItemMatches)
			{
				advancePast = key->curItem;
				match = false;
				break;
			}

			/*
			 * It's a match. We can conclude that nothing < matches, so the
			 * other key streams can skip to this item.
			 *
			 * Beware of lossy pointers, though; from a lossy pointer, we can
			 * only conclude that nothing smaller than this *block* matches.
			 */
			if (ItemPointerIsLossyPage(&key->curItem))
			{
				if (GinItemPointerGetBlockNumber(&advancePast) <
					GinItemPointerGetBlockNumber(&key->curItem))
				{
					ItemPointerSet(&advancePast,
								   GinItemPointerGetBlockNumber(&key->curItem),
								   InvalidOffsetNumber);
				}
			}
			else
			{
				Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0);
				ItemPointerSet(&advancePast,
							   GinItemPointerGetBlockNumber(&key->curItem),
							   OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem)));
			}

			/*
			 * If this is the first key, remember this location as a potential
			 * match, and proceed to check the rest of the keys.
			 *
			 * Otherwise, check if this is the same item that we checked the
			 * previous keys for (or a lossy pointer for the same page).  If
			 * not, loop back to check the previous keys for this item (we
			 * will check this key again too, but keyGetItem returns quickly
			 * for that)
			 */
			if (i == 0)
			{
				*item = key->curItem;
			}
			else
			{
				if (ItemPointerIsLossyPage(&key->curItem) ||
					ItemPointerIsLossyPage(item))
				{
					/* lossy vs anything: match if same heap block */
					Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
					match = (GinItemPointerGetBlockNumber(&key->curItem) ==
							 GinItemPointerGetBlockNumber(item));
				}
				else
				{
					/* both exact: match only if identical TID */
					Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
					match = (ginCompareItemPointers(&key->curItem, item) == 0);
				}
			}
		}
	} while (!match);

	Assert(!ItemPointerIsMin(item));

	/*
	 * Now *item contains the first ItemPointer after previous result that
	 * satisfied all the keys for that exact TID, or a lossy reference to the
	 * same page.
	 *
	 * We must return recheck = true if any of the keys are marked recheck.
	 */
	*recheck = false;
	for (i = 0; i < so->nkeys; i++)
	{
		GinScanKey	key = so->keys + i;

		if (key->recheckCurItem)
		{
			*recheck = true;
			break;
		}
	}

	return true;
}
|
|
|
|
|
|
/*
|
|
* Functions for scanning the pending list
|
|
*/
|
|
|
|
|
|
/*
 * Get ItemPointer of next heap row to be checked from pending list.
 * Returns false if there are no more. On pages with several heap rows
 * it returns each row separately, on page with part of heap row returns
 * per page data. pos->firstOffset and pos->lastOffset are set to identify
 * the range of pending-list tuples belonging to this heap row.
 *
 * The pendingBuffer is presumed pinned and share-locked on entry, and is
 * pinned and share-locked on success exit. On failure exit it's released.
 */
static bool
scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
{
	OffsetNumber maxoff;
	Page		page;
	IndexTuple	itup;

	ItemPointerSetInvalid(&pos->item);
	for (;;)
	{
		page = BufferGetPage(pos->pendingBuffer);

		maxoff = PageGetMaxOffsetNumber(page);
		if (pos->firstOffset > maxoff)
		{
			/* Current page exhausted; follow the rightlink to the next one */
			BlockNumber blkno = GinPageGetOpaque(page)->rightlink;

			if (blkno == InvalidBlockNumber)
			{
				/* End of pending list reached */
				UnlockReleaseBuffer(pos->pendingBuffer);
				pos->pendingBuffer = InvalidBuffer;

				return false;
			}
			else
			{
				/*
				 * Here we must prevent deletion of next page by insertcleanup
				 * process, which may be trying to obtain exclusive lock on
				 * current page.  So, we lock next page before releasing the
				 * current one
				 */
				Buffer		tmpbuf = ReadBuffer(scan->indexRelation, blkno);

				LockBuffer(tmpbuf, GIN_SHARE);
				UnlockReleaseBuffer(pos->pendingBuffer);

				pos->pendingBuffer = tmpbuf;
				pos->firstOffset = FirstOffsetNumber;
			}
		}
		else
		{
			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset));
			pos->item = itup->t_tid;
			if (GinPageHasFullRow(page))
			{
				/*
				 * find itempointer to the next row
				 */
				for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++)
				{
					itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset));
					if (!ItemPointerEquals(&pos->item, &itup->t_tid))
						break;
				}
			}
			else
			{
				/*
				 * All itempointers are the same on this page
				 */
				pos->lastOffset = maxoff + 1;
			}

			/*
			 * Now pos->firstOffset points to the first tuple of current heap
			 * row, pos->lastOffset points to the first tuple of next heap row
			 * (or to the end of page)
			 */
			break;
		}
	}

	return true;
}
|
|
|
|
/*
|
|
* Scan pending-list page from current tuple (off) up till the first of:
|
|
* - match is found (then returns true)
|
|
* - no later match is possible
|
|
* - tuple's attribute number is not equal to entry's attrnum
|
|
* - reach end of page
|
|
*
|
|
* datum[]/category[]/datumExtracted[] arrays are used to cache the results
|
|
* of gintuple_get_key() on the current page.
|
|
*/
|
|
static bool
|
|
matchPartialInPendingList(GinState *ginstate, Page page,
|
|
OffsetNumber off, OffsetNumber maxoff,
|
|
GinScanEntry entry,
|
|
Datum *datum, GinNullCategory *category,
|
|
bool *datumExtracted)
|
|
{
|
|
IndexTuple itup;
|
|
int32 cmp;
|
|
|
|
/* Partial match to a null is not possible */
|
|
if (entry->queryCategory != GIN_CAT_NORM_KEY)
|
|
return false;
|
|
|
|
while (off < maxoff)
|
|
{
|
|
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
|
|
|
|
if (gintuple_get_attrnum(ginstate, itup) != entry->attnum)
|
|
return false;
|
|
|
|
if (datumExtracted[off - 1] == false)
|
|
{
|
|
datum[off - 1] = gintuple_get_key(ginstate, itup,
|
|
&category[off - 1]);
|
|
datumExtracted[off - 1] = true;
|
|
}
|
|
|
|
/* Once we hit nulls, no further match is possible */
|
|
if (category[off - 1] != GIN_CAT_NORM_KEY)
|
|
return false;
|
|
|
|
/*----------
|
|
* Check partial match.
|
|
* case cmp == 0 => match
|
|
* case cmp > 0 => not match and end scan (no later match possible)
|
|
* case cmp < 0 => not match and continue scan
|
|
*----------
|
|
*/
|
|
cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1],
|
|
ginstate->supportCollation[entry->attnum - 1],
|
|
entry->queryKey,
|
|
datum[off - 1],
|
|
UInt16GetDatum(entry->strategy),
|
|
PointerGetDatum(entry->extra_data)));
|
|
if (cmp == 0)
|
|
return true;
|
|
else if (cmp > 0)
|
|
return false;
|
|
|
|
off++;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
 * Set up the entryRes array for each key by looking at
 * every entry for current heap row in pending list.
 *
 * Returns true if each scan key has at least one entryRes match.
 * This corresponds to the situations where the normal index search will
 * try to apply the key's consistentFn.  (A tuple not meeting that requirement
 * cannot be returned by the normal search since no entry stream will
 * source its TID.)
 *
 * The pendingBuffer is presumed pinned and share-locked on entry.
 */
static bool
collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
{
	GinScanOpaque so = (GinScanOpaque) scan->opaque;
	OffsetNumber attrnum;
	Page		page;
	IndexTuple	itup;
	int			i,
				j;

	/*
	 * Reset all entryRes and hasMatchKey flags
	 */
	for (i = 0; i < so->nkeys; i++)
	{
		GinScanKey	key = so->keys + i;

		memset(key->entryRes, GIN_FALSE, key->nentries);
	}
	memset(pos->hasMatchKey, false, so->nkeys);

	/*
	 * Outer loop iterates over multiple pending-list pages when a single heap
	 * row has entries spanning those pages.
	 */
	for (;;)
	{
		/*
		 * Per-offset caches of the key datum extracted from each pending
		 * tuple; sized for the maximum possible number of tuples on a page.
		 */
		Datum		datum[BLCKSZ / sizeof(IndexTupleData)];
		GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)];
		bool		datumExtracted[BLCKSZ / sizeof(IndexTupleData)];

		Assert(pos->lastOffset > pos->firstOffset);
		memset(datumExtracted + pos->firstOffset - 1, 0,
			   sizeof(bool) * (pos->lastOffset - pos->firstOffset));

		page = BufferGetPage(pos->pendingBuffer);

		for (i = 0; i < so->nkeys; i++)
		{
			GinScanKey	key = so->keys + i;

			for (j = 0; j < key->nentries; j++)
			{
				GinScanEntry entry = key->scanEntry[j];
				OffsetNumber StopLow = pos->firstOffset,
							StopHigh = pos->lastOffset,
							StopMiddle;

				/* If already matched on earlier page, do no extra work */
				if (key->entryRes[j])
					continue;

				/*
				 * Interesting tuples are from pos->firstOffset to
				 * pos->lastOffset and they are ordered by (attnum, Datum) as
				 * it's done in entry tree.  So we can use binary search to
				 * avoid linear scanning.
				 */
				while (StopLow < StopHigh)
				{
					int			res;

					StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);

					itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));

					attrnum = gintuple_get_attrnum(&so->ginstate, itup);

					/* First compare by attribute number ... */
					if (key->attnum < attrnum)
					{
						StopHigh = StopMiddle;
						continue;
					}
					if (key->attnum > attrnum)
					{
						StopLow = StopMiddle + 1;
						continue;
					}

					/* ... then by key datum, extracting it lazily */
					if (datumExtracted[StopMiddle - 1] == false)
					{
						datum[StopMiddle - 1] =
							gintuple_get_key(&so->ginstate, itup,
											 &category[StopMiddle - 1]);
						datumExtracted[StopMiddle - 1] = true;
					}

					if (entry->queryCategory == GIN_CAT_EMPTY_QUERY)
					{
						/* special behavior depending on searchMode */
						if (entry->searchMode == GIN_SEARCH_MODE_ALL)
						{
							/* match anything except NULL_ITEM */
							if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM)
								res = -1;
							else
								res = 0;
						}
						else
						{
							/* match everything */
							res = 0;
						}
					}
					else
					{
						res = ginCompareEntries(&so->ginstate,
												entry->attnum,
												entry->queryKey,
												entry->queryCategory,
												datum[StopMiddle - 1],
												category[StopMiddle - 1]);
					}

					if (res == 0)
					{
						/*
						 * Found exact match (there can be only one, except in
						 * EMPTY_QUERY mode).
						 *
						 * If doing partial match, scan forward from here to
						 * end of page to check for matches.
						 *
						 * See comment above about tuple's ordering.
						 */
						if (entry->isPartialMatch)
							key->entryRes[j] =
								matchPartialInPendingList(&so->ginstate,
														  page,
														  StopMiddle,
														  pos->lastOffset,
														  entry,
														  datum,
														  category,
														  datumExtracted);
						else
							key->entryRes[j] = true;

						/* done with binary search */
						break;
					}
					else if (res < 0)
						StopHigh = StopMiddle;
					else
						StopLow = StopMiddle + 1;
				}

				if (StopLow >= StopHigh && entry->isPartialMatch)
				{
					/*
					 * No exact match on this page.  If doing partial match,
					 * scan from the first tuple greater than target value to
					 * end of page.  Note that since we don't remember whether
					 * the comparePartialFn told us to stop early on a
					 * previous page, we will uselessly apply comparePartialFn
					 * to the first tuple on each subsequent page.
					 */
					key->entryRes[j] =
						matchPartialInPendingList(&so->ginstate,
												  page,
												  StopHigh,
												  pos->lastOffset,
												  entry,
												  datum,
												  category,
												  datumExtracted);
				}

				pos->hasMatchKey[i] |= key->entryRes[j];
			}
		}

		/* Advance firstOffset over the scanned tuples */
		pos->firstOffset = pos->lastOffset;

		if (GinPageHasFullRow(page))
		{
			/*
			 * We have examined all pending entries for the current heap row.
			 * Break out of loop over pages.
			 */
			break;
		}
		else
		{
			/*
			 * Advance to next page of pending entries for the current heap
			 * row.  Complain if there isn't one.
			 */
			ItemPointerData item = pos->item;

			if (scanGetCandidate(scan, pos) == false ||
				!ItemPointerEquals(&pos->item, &item))
				elog(ERROR, "could not find additional pending pages for same heap tuple");
		}
	}

	/*
	 * All scan keys except excludeOnly require at least one entry to match.
	 * excludeOnly keys are an exception, because their implied
	 * GIN_CAT_EMPTY_QUERY scanEntry always matches.  So return "true" if all
	 * non-excludeOnly scan keys have at least one match.
	 */
	for (i = 0; i < so->nkeys; i++)
	{
		if (pos->hasMatchKey[i] == false && !so->keys[i].excludeOnly)
			return false;
	}

	return true;
}
|
|
|
|
/*
 * Collect all matched rows from pending list into bitmap.
 *
 * Rows found here are added to *tbm and counted in *ntids.  If the index
 * has no pending list, this returns without doing anything.
 */
static void
scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
{
	GinScanOpaque so = (GinScanOpaque) scan->opaque;
	MemoryContext oldCtx;
	bool		recheck,
				match;
	int			i;
	pendingPosition pos;
	Buffer		metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO);
	Page		page;
	BlockNumber blkno;

	*ntids = 0;

	/*
	 * Acquire predicate lock on the metapage, to conflict with any fastupdate
	 * insertions.
	 */
	PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);

	LockBuffer(metabuffer, GIN_SHARE);
	page = BufferGetPage(metabuffer);
	blkno = GinPageGetMeta(page)->head;

	/*
	 * fetch head of list before unlocking metapage. head page must be pinned
	 * to prevent deletion by vacuum process
	 */
	if (blkno == InvalidBlockNumber)
	{
		/* No pending list, so proceed with normal scan */
		UnlockReleaseBuffer(metabuffer);
		return;
	}

	/* Pin and lock the head page before letting go of the metapage */
	pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
	LockBuffer(pos.pendingBuffer, GIN_SHARE);
	pos.firstOffset = FirstOffsetNumber;
	UnlockReleaseBuffer(metabuffer);
	pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys);

	/*
	 * loop for each heap row. scanGetCandidate returns full row or row's
	 * tuples from first page.
	 */
	while (scanGetCandidate(scan, &pos))
	{
		/*
		 * Check entries in tuple and set up entryRes array.
		 *
		 * If pending tuples belonging to the current heap row are spread
		 * across several pages, collectMatchesForHeapRow will read all of
		 * those pages.
		 */
		if (!collectMatchesForHeapRow(scan, &pos))
			continue;

		/*
		 * Matching of entries of one row is finished, so check row using
		 * consistent functions.
		 */
		oldCtx = MemoryContextSwitchTo(so->tempCtx);
		recheck = false;
		match = true;

		for (i = 0; i < so->nkeys; i++)
		{
			GinScanKey	key = so->keys + i;

			/* All keys must be consistent (AND semantics) */
			if (!key->boolConsistentFn(key))
			{
				match = false;
				break;
			}
			recheck |= key->recheckCurItem;
		}

		MemoryContextSwitchTo(oldCtx);
		MemoryContextReset(so->tempCtx);

		if (match)
		{
			tbm_add_tuples(tbm, &pos.item, 1, recheck);
			(*ntids)++;
		}
	}

	pfree(pos.hasMatchKey);
}
|
|
|
|
|
|
#define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes )
|
|
|
|
int64
|
|
gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
|
|
{
|
|
GinScanOpaque so = (GinScanOpaque) scan->opaque;
|
|
int64 ntids;
|
|
ItemPointerData iptr;
|
|
bool recheck;
|
|
|
|
/*
|
|
* Set up the scan keys, and check for unsatisfiable query.
|
|
*/
|
|
ginFreeScanKeys(so); /* there should be no keys yet, but just to be
|
|
* sure */
|
|
ginNewScanKey(scan);
|
|
|
|
if (GinIsVoidRes(scan))
|
|
return 0;
|
|
|
|
ntids = 0;
|
|
|
|
/*
|
|
* First, scan the pending list and collect any matching entries into the
|
|
* bitmap. After we scan a pending item, some other backend could post it
|
|
* into the main index, and so we might visit it a second time during the
|
|
* main scan. This is okay because we'll just re-set the same bit in the
|
|
* bitmap. (The possibility of duplicate visits is a major reason why GIN
|
|
* can't support the amgettuple API, however.) Note that it would not do
|
|
* to scan the main index before the pending list, since concurrent
|
|
* cleanup could then make us miss entries entirely.
|
|
*/
|
|
scanPendingInsert(scan, tbm, &ntids);
|
|
|
|
/*
|
|
* Now scan the main index.
|
|
*/
|
|
startScan(scan);
|
|
|
|
ItemPointerSetMin(&iptr);
|
|
|
|
for (;;)
|
|
{
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
if (!scanGetItem(scan, iptr, &iptr, &recheck))
|
|
break;
|
|
|
|
if (ItemPointerIsLossyPage(&iptr))
|
|
tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr));
|
|
else
|
|
tbm_add_tuples(tbm, &iptr, 1, recheck);
|
|
ntids++;
|
|
}
|
|
|
|
return ntids;
|
|
}
|